vp8: [loongson] optimize loopfilter v2.

Optimize vp8_mbloop_filter_vertical_edge_mmi and
vp8_mbloop_filter_horizontal_edge_mmi: issue memory loads early so
that their latency is hidden behind independent instructions, and
remove unnecessary instructions.

Change-Id: I61da2c3a44c06044225461f46bf487d83cba6c16
diff --git a/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
index 026e575..f2182f9 100644
--- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c
+++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -461,96 +461,87 @@
   );
 }
 
+/* clang-format off */
 #define VP8_MBLOOP_HPSRAB                                               \
-  "xor        %[ftmp3],   %[ftmp3],           %[ftmp3]            \n\t" \
-  "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
-  "punpcklbh  %[ftmp3],   %[ftmp3],           %[ftmp0]            \n\t" \
-  "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp0]            \n\t" \
-  "psrah      %[ftmp3],   %[ftmp3],           %[ftmp9]            \n\t" \
-  "psrah      %[ftmp8],   %[ftmp8],           %[ftmp9]            \n\t" \
-  "packsshb   %[ftmp0],   %[ftmp3],           %[ftmp8]            \n\t"
+  "punpcklbh  %[ftmp10],  %[ftmp10],          %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp0]            \n\t" \
+  "psrah      %[ftmp10],  %[ftmp10],          %[ftmp9]            \n\t" \
+  "psrah      %[ftmp11],  %[ftmp11],          %[ftmp9]            \n\t" \
+  "packsshb   %[ftmp0],   %[ftmp10],          %[ftmp11]            \n\t"
 
-#define VP8_MBLOOP_HPSRAB_PMULHH(reg1, reg2) \
-  "pmulhh   " #reg1 ",  " #reg1 ",  " #reg2 "                     \n\t"
-
-#define VP8_MBLOOP_HPSRAB_ADD(reg) \
-  "xor        %[ftmp3],   %[ftmp3],           %[ftmp3]            \n\t" \
-  "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
-  "punpcklbh  %[ftmp3],   %[ftmp3],           %[ftmp2]            \n\t" \
-  "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp2]            \n\t" \
-  VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp3], reg)                               \
-  VP8_MBLOOP_HPSRAB_PMULHH(%[ftmp8], reg)                               \
-  "paddh      %[ftmp3],   %[ftmp3],           %[ff_ph_003f]       \n\t" \
-  "paddh      %[ftmp8],   %[ftmp8],           %[ff_ph_003f]       \n\t" \
-  "psrah      %[ftmp3],   %[ftmp3],           %[ftmp9]            \n\t" \
-  "psrah      %[ftmp8],   %[ftmp8],           %[ftmp9]            \n\t" \
-  "packsshb   %[ftmp3],   %[ftmp3],           %[ftmp8]            \n\t"
+#define VP8_MBLOOP_HPSRAB_ADD(reg)                                      \
+  "punpcklbh  %[ftmp1],   %[ftmp0],           %[ftmp12]           \n\t" \
+  "punpckhbh  %[ftmp2],   %[ftmp0],           %[ftmp12]           \n\t" \
+  "pmulhh     %[ftmp1],   %[ftmp1],         " #reg "              \n\t" \
+  "pmulhh     %[ftmp2],   %[ftmp2],         " #reg "              \n\t" \
+  "paddh      %[ftmp1],   %[ftmp1],           %[ff_ph_003f]       \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],           %[ff_ph_003f]       \n\t" \
+  "psrah      %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t" \
+  "psrah      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t" \
+  "packsshb   %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+/* clang-format on */
 
 void vp8_mbloop_filter_horizontal_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
   uint32_t tmp[1];
-  mips_reg addr[2];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
-  double ftmp[10];
+  double ftmp[13];
 
   __asm__ volatile (
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
     "1:                                                             \n\t"
     "gsldlc1    %[ftmp9],   0x07(%[limit])                          \n\t"
     "gsldrc1    %[ftmp9],   0x00(%[limit])                          \n\t"
+    /* ftmp1: p3 */
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp3: p2 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp4: p1 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp4],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp5: p0 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp6: q0 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp7: q1 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp8: q2 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp2: q3 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp2],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[src_ptr])                        \n\t"
 
-    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp12],  0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp12],  0x00(%[blimit])                         \n\t"
 
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp1],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
-    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
-    "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp0],   %[ftmp1],           %[ftmp3]            \n\t"
     "psubusb    %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
-
-    /* ftmp4:p1 */
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
-    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp3],           %[ftmp4]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    /* ftmp5:p0 */
-    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
-    "gsldlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
-    "pasubub    %[ftmp1],   %[ftmp4],           %[ftmp5]            \n\t"
-    "sdc1       %[ftmp1],   0x00(%[srct])                           \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "pasubub    %[ftmp10],  %[ftmp4],           %[ftmp5]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    /* ftmp6:q0 */
-    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
-    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
-
-    /* ftmp7:q1 */
-    "gsldlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
-    "pasubub    %[ftmp1],   %[ftmp7],           %[ftmp6]            \n\t"
-    "sdc1       %[ftmp1],   0x08(%[srct])                           \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "pasubub    %[ftmp11],  %[ftmp7],           %[ftmp6]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp11],          %[ftmp9]            \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp8],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp8],           %[ftmp7]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-
-    MMI_ADDU(%[addr1], %[addr0], %[tmp0])
-    "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp2],           %[ftmp8]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
@@ -563,9 +554,7 @@
     "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
     "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
-    "gsldlc1    %[ftmp9],   0x07(%[blimit])                         \n\t"
-    "gsldrc1    %[ftmp9],   0x00(%[blimit])                         \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp12]           \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
     "xor        %[ftmp9],   %[ftmp9],           %[ftmp9]            \n\t"
     /* ftmp0: mask */
@@ -573,29 +562,26 @@
 
     "gsldlc1    %[ftmp9],   0x07(%[thresh])                         \n\t"
     "gsldrc1    %[ftmp9],   0x00(%[thresh])                         \n\t"
-    "ldc1       %[ftmp1],   0x00(%[srct])                           \n\t"
-    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
-    "ldc1       %[ftmp2],   0x08(%[srct])                           \n\t"
-    "psubusb    %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
+    "psubusb    %[ftmp2],   %[ftmp11],          %[ftmp9]            \n\t"
     "paddb      %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "xor        %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
     "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
-    /* ftmp1:hev*/
+    /* ftmp1: hev */
     "xor        %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
 
     "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
     "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
     "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
     "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-
     "psubsb     %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
     "psubsb     %[ftmp9],   %[ftmp6],           %[ftmp5]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "and        %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
-    "sdc1       %[ftmp2],   0x00(%[srct])                           \n\t"
+    "pandn      %[ftmp12],  %[ftmp1],           %[ftmp2]            \n\t"
     "and        %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
 
     "li         %[tmp0],    0x0b                                    \n\t"
@@ -606,75 +592,71 @@
     "paddsb     %[ftmp0],   %[ftmp2],           %[ff_pb_04]         \n\t"
     VP8_MBLOOP_HPSRAB
     "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
-    "ldc1       %[ftmp2],   0x00(%[srct])                           \n\t"
-    "pandn      %[ftmp2],   %[ftmp1],           %[ftmp2]            \n\t"
 
     "li         %[tmp0],    0x07                                    \n\t"
     "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
-    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"
-    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp3]            \n\t"
+    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp1]            \n\t"
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp1]            \n\t"
     "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
     "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
-
-    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
-    "gssdlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    "gssdlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
     "gssdlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
     "gssdrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
-    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp3]            \n\t"
-    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp3]            \n\t"
+    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
     "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
     "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-
-    "gssdlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
-    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
-    "gssdlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp4],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"
 
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
-    "gsldlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
-    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
-    "gsldlc1    %[ftmp7],   0x07(%[addr1])                          \n\t"
-    "gsldrc1    %[ftmp7],   0x00(%[addr1])                          \n\t"
+    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "xor        %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
+    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "xor        %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+    "gssdlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"
 
-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp3]            \n\t"
-    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp3]            \n\t"
-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
-    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp7],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp7],   0x00(%[addr1])                          \n\t"
-    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
-    "gssdlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
-    "gssdrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
-
-    "addiu      %[count],   %[count],           -0x01               \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
     MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+    "addiu      %[count],   %[count],           -0x01               \n\t"
     "bnez       %[count],   1b                                      \n\t"
     : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
       [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
       [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
       [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
       [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
-      [tmp0]"=&r"(tmp[0]),
-      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
-    : [limit]"r"(limit),                [blimit]"r"(blimit),
-      [srct]"r"(srct),                  [thresh]"r"(thresh),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count)
+    : [limit]"r"(limit),                  [blimit]"r"(blimit),
+      [thresh]"r"(thresh),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_03]"f"(ff_pb_03),
-      [ff_ph_0900]"f"(ff_ph_0900),      [ff_ph_1b00]"f"(ff_ph_1b00),
-      [ff_ph_1200]"f"(ff_ph_1200),      [ff_ph_003f]"f"(ff_ph_003f)
+      [ff_pb_fe]"f"(ff_pb_fe),            [ff_pb_80]"f"(ff_pb_80),
+      [ff_pb_04]"f"(ff_pb_04),            [ff_pb_03]"f"(ff_pb_03),
+      [ff_ph_0900]"f"(ff_ph_0900),        [ff_ph_1b00]"f"(ff_ph_1b00),
+      [ff_ph_1200]"f"(ff_ph_1200),        [ff_ph_003f]"f"(ff_ph_003f)
     : "memory"
   );
 }
@@ -696,64 +678,60 @@
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
   mips_reg tmp[1];
-  mips_reg addr[2];
   DECLARE_ALIGNED(8, const uint64_t, srct[1]);
-  double ftmp[13];
+  double ftmp[14];
 
   __asm__ volatile (
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
 
     "1:                                                             \n\t"
-    MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
-    MMI_ADDU(%[addr0], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp11],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[addr0])                          \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp1],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp2],   %[ftmp11],          %[ftmp12]           \n\t"
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
 
-    "gsldlc1    %[ftmp11],  0x07(%[src_ptr])                        \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[src_ptr])                        \n\t"
-    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp3],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp4],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpcklbh  %[ftmp11],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp12],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp8]            \n\t"
+    "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp8]            \n\t"
 
-    "punpcklhw  %[ftmp5],   %[ftmp4],           %[ftmp2]            \n\t"
-    "punpckhhw  %[ftmp6],   %[ftmp4],           %[ftmp2]            \n\t"
-    "punpcklhw  %[ftmp7],   %[ftmp3],           %[ftmp1]            \n\t"
-    "punpckhhw  %[ftmp8],   %[ftmp3],           %[ftmp1]            \n\t"
+    "punpcklhw  %[ftmp1],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp2],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpcklhw  %[ftmp3],   %[ftmp11],          %[ftmp9]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp11],          %[ftmp9]            \n\t"
 
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
-    MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp11],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[addr0])                          \n\t"
-    MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp9],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp10],  %[ftmp11],          %[ftmp12]           \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
 
-    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
-    MMI_SUBU(%[addr0], %[src_ptr], %[tmp0])
-    "gsldlc1    %[ftmp11],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp11],  0x00(%[addr0])                          \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
-    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
-    "punpcklbh  %[ftmp0],   %[ftmp11],          %[ftmp12]           \n\t"
-    "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp12]           \n\t"
+    "punpcklbh  %[ftmp11],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp12],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp8]            \n\t"
+    "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp8]            \n\t"
 
-    "punpcklhw  %[ftmp1],   %[ftmp11],          %[ftmp10]           \n\t"
-    "punpckhhw  %[ftmp2],   %[ftmp11],          %[ftmp10]           \n\t"
-    "punpcklhw  %[ftmp3],   %[ftmp0],           %[ftmp9]            \n\t"
-    "punpckhhw  %[ftmp4],   %[ftmp0],           %[ftmp9]            \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpcklhw  %[ftmp7],   %[ftmp11],          %[ftmp9]            \n\t"
+    "punpckhhw  %[ftmp8],   %[ftmp11],          %[ftmp9]            \n\t"
 
+    "gsldlc1    %[ftmp13],  0x07(%[limit])                          \n\t"
+    "gsldrc1    %[ftmp13],  0x00(%[limit])                          \n\t"
     /* ftmp9:q0  ftmp10:q1 */
     "punpcklwd  %[ftmp9],   %[ftmp1],           %[ftmp5]            \n\t"
     "punpckhwd  %[ftmp10],  %[ftmp1],           %[ftmp5]            \n\t"
@@ -771,60 +749,61 @@
     "punpcklwd  %[ftmp5],   %[ftmp4],           %[ftmp8]            \n\t"
     "punpckhwd  %[ftmp6],   %[ftmp4],           %[ftmp8]            \n\t"
 
-    "gsldlc1    %[ftmp8],   0x07(%[limit])                          \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[limit])                          \n\t"
-
     /* abs (q3-q2) */
     "pasubub    %[ftmp7],   %[ftmp12],          %[ftmp11]           \n\t"
-    "psubusb    %[ftmp0],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp0],   %[ftmp7],           %[ftmp13]           \n\t"
     /* abs (q2-q1) */
     "pasubub    %[ftmp7],   %[ftmp11],          %[ftmp10]           \n\t"
-    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* ftmp3: abs(q1-q0) */
     "pasubub    %[ftmp3],   %[ftmp10],          %[ftmp9]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp13]           \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* ftmp4: abs(p1-p0) */
     "pasubub    %[ftmp4],   %[ftmp5],           %[ftmp6]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp13]           \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* abs (p2-p1) */
     "pasubub    %[ftmp7],   %[ftmp2],           %[ftmp5]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* abs (p3-p2) */
     "pasubub    %[ftmp7],   %[ftmp1],           %[ftmp2]            \n\t"
-    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
-    /* abs (p0-q0) */
+
+    "gsldlc1    %[ftmp13],  0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp13],  0x00(%[blimit])                         \n\t"
+    "gsldlc1    %[ftmp7],   0x07(%[thresh])                         \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[thresh])                         \n\t"
+    /* abs (p0-q0) * 2 */
     "pasubub    %[ftmp1],   %[ftmp9],           %[ftmp6]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
-    /* abs (p1-q1) */
+    /* abs (p1-q1) / 2 */
     "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
     "and        %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
     "li         %[tmp0],    0x01                                    \n\t"
     "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
     "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
     "paddusb    %[ftmp12],  %[ftmp1],           %[ftmp12]           \n\t"
-
-    "gsldlc1    %[ftmp8],   0x07(%[blimit])                         \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[blimit])                         \n\t"
-    "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
+    "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp13]           \n\t"
     "or         %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
     "xor        %[ftmp12],  %[ftmp12],          %[ftmp12]           \n\t"
+    /* ftmp0: mask */
     "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
 
-    "gsldlc1    %[ftmp8],   0x07(%[thresh])                         \n\t"
-    "gsldrc1    %[ftmp8],   0x00(%[thresh])                         \n\t"
-    /* ftmp3: abs(q1-q0)  ftmp4: abs(p1-p0) */
-    "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp8]            \n\t"
-    "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp8]            \n\t"
+    /* abs(p1-p0) - thresh */
+    "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp7]            \n\t"
+    /* abs(q1-q0) - thresh */
+    "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp7]            \n\t"
     "or         %[ftmp3],   %[ftmp4],           %[ftmp3]            \n\t"
     "pcmpeqb    %[ftmp3],   %[ftmp3],           %[ftmp12]           \n\t"
     "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    /* ftmp1: hev */
     "xor        %[ftmp1],   %[ftmp3],           %[ftmp1]            \n\t"
 
+    /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
     "xor        %[ftmp11],  %[ftmp11],          %[ff_pb_80]         \n\t"
     "xor        %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
     "xor        %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
@@ -837,30 +816,30 @@
     "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
     "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
     "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
+    /* filter_value &= mask */
     "and        %[ftmp0],   %[ftmp0],           %[ftmp3]            \n\t"
+    /* Filter2 = filter_value & hev */
     "and        %[ftmp3],   %[ftmp1],           %[ftmp0]            \n\t"
+    /* filter_value &= ~hev */
     "pandn      %[ftmp0],   %[ftmp1],           %[ftmp0]            \n\t"
 
     "paddsb     %[ftmp4],   %[ftmp3],           %[ff_pb_04]         \n\t"
     "li         %[tmp0],    0x0b                                    \n\t"
     "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t"
     "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp4]            \n\t"
     "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp4]            \n\t"
     "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
     "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t"
     "packsshb   %[ftmp4],   %[ftmp7],           %[ftmp8]            \n\t"
+    /* ftmp9: qs0 */
     "psubsb     %[ftmp9],   %[ftmp9],           %[ftmp4]            \n\t"
     "paddsb     %[ftmp3],   %[ftmp3],           %[ff_pb_03]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t"
     "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp3]            \n\t"
     "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp3]            \n\t"
     "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
     "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t"
     "packsshb   %[ftmp3],   %[ftmp7],           %[ftmp8]            \n\t"
-
+    /* ftmp6: ps0 */
     "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"
 
     "li         %[tmp0],    0x07                                    \n\t"
@@ -872,8 +851,10 @@
     "pmulhh     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
     VP8_MBLOOP_VPSRAB_ADDT
     "psubsb     %[ftmp4],   %[ftmp9],           %[ftmp3]            \n\t"
+    /* ftmp9: oq0 */
     "xor        %[ftmp9],   %[ftmp4],           %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp4],   %[ftmp6],           %[ftmp3]            \n\t"
+    /* ftmp6: op0 */
     "xor        %[ftmp6],   %[ftmp4],           %[ff_pb_80]         \n\t"
 
     VP8_MBLOOP_VPSRAB_ADDH
@@ -882,8 +863,10 @@
     "pmulhh     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
     VP8_MBLOOP_VPSRAB_ADDT
     "psubsb     %[ftmp4],   %[ftmp10],          %[ftmp3]            \n\t"
+    /* ftmp10: oq1 */
     "xor        %[ftmp10],   %[ftmp4],          %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp4],   %[ftmp5],           %[ftmp3]            \n\t"
+    /* ftmp5: op1 */
     "xor        %[ftmp5],   %[ftmp4],           %[ff_pb_80]         \n\t"
 
     VP8_MBLOOP_VPSRAB_ADDH
@@ -891,8 +874,10 @@
     "pmulhh     %[ftmp8],   %[ftmp8],           %[ff_ph_0900]       \n\t"
     VP8_MBLOOP_VPSRAB_ADDT
     "psubsb     %[ftmp4],   %[ftmp11],          %[ftmp3]            \n\t"
+    /* ftmp11: oq2 */
     "xor        %[ftmp11],  %[ftmp4],           %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp4],   %[ftmp2],           %[ftmp3]            \n\t"
+    /* ftmp2: op2 */
     "xor        %[ftmp2],   %[ftmp4],           %[ff_pb_80]         \n\t"
 
     "ldc1       %[ftmp12],  0x00(%[srct])                           \n\t"
@@ -916,41 +901,40 @@
     "punpcklhw  %[ftmp10],  %[ftmp1],           %[ftmp3]            \n\t"
     "punpckhhw  %[ftmp11],  %[ftmp1],           %[ftmp3]            \n\t"
 
-    "punpcklwd  %[ftmp0],   %[ftmp6],           %[ftmp10]           \n\t"
-    "punpckhwd  %[ftmp1],   %[ftmp6],           %[ftmp10]           \n\t"
-
-    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
-    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-
     "punpcklwd  %[ftmp0],   %[ftmp7],           %[ftmp11]           \n\t"
     "punpckhwd  %[ftmp1],   %[ftmp7],           %[ftmp11]           \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
-    MMI_ADDU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklwd  %[ftmp0],   %[ftmp6],           %[ftmp10]           \n\t"
+    "punpckhwd  %[ftmp1],   %[ftmp6],           %[ftmp10]           \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
 
     "punpcklwd  %[ftmp1],   %[ftmp5],           %[ftmp9]            \n\t"
     "punpckhwd  %[ftmp0],   %[ftmp5],           %[ftmp9]            \n\t"
-    MMI_SUBU(%[addr0], %[src_ptr], %[src_pixel_step])
-    "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
-    MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
 
     "punpcklwd  %[ftmp1],   %[ftmp4],           %[ftmp8]            \n\t"
     "punpckhwd  %[ftmp0],   %[ftmp4],           %[ftmp8]            \n\t"
-    MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
-    MMI_SUBU(%[addr0], %[addr0], %[src_pixel_step])
-    "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-    "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
     "addiu      %[count],   %[count],           -0x01               \n\t"
 
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
@@ -962,9 +946,9 @@
       [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
       [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
-      [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
-      [addr0]"=&r"(addr[0]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [ftmp12]"=&f"(ftmp[12]),            [ftmp13]"=&f"(ftmp[13]),
+      [tmp0]"=&r"(tmp[0]),                [src_ptr]"+&r"(src_ptr),
+      [count]"+&r"(count)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [srct]"r"(srct),                  [thresh]"r"(thresh),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),