ARM Skia NEON patches - 23 - S32_D565_Opaque_Dither cleanup/bugfix/speed




BlitRow565: S32_D565_Opaque_Dither: cleaning / bugfix

This patch brings a little code cleaning (spaces/comments) and a little
speed improvement (by using post-incrementation in the asm) but more
importantly it fixes a bug on Linux. The new code now supports ARGB
as well as ABGR.

I removed the comment as I have confirmed with benchmarks that this
code bring a *massive* (3x-7x) speedup compared to the C code.

Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=
R=djsollen@google.com, mtklein@google.com

Author: kevin.petit.arm@gmail.com

Review URL: https://chromiumcodereview.appspot.com/22269003

git-svn-id: http://skia.googlecode.com/svn/trunk/src@11339 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/opts/SkBlitRow_opts_arm_neon.cpp b/opts/SkBlitRow_opts_arm_neon.cpp
index 1bc0ea1..705ee99 100644
--- a/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/opts/SkBlitRow_opts_arm_neon.cpp
@@ -1125,13 +1125,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
- * speedup untested, but ARM version is 26 insns/iteration and
- * this NEON version is 21 insns/iteration-of-8 (2.62insns/element)
- * which is 10x the native version; that's pure instruction counts,
- * not accounting for any instruction or memory latencies.
- */
-
 #undef    DEBUG_S32_OPAQUE_DITHER
 
 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
@@ -1150,18 +1143,23 @@
         uint16x8_t dr, dg, db;
         uint16x8_t dst8;
 
-        /* source is in ABGR ordering (R == lsb) */
         {
         register uint8x8_t d0 asm("d0");
         register uint8x8_t d1 asm("d1");
         register uint8x8_t d2 asm("d2");
         register uint8x8_t d3 asm("d3");
 
-        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
-            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
-            : "r" (src)
-                    );
-            sr = d0; sg = d1; sb = d2;
+        asm (
+            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
+            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
+            :
+        );
+        sg = d1;
+#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
+        sr = d2; sb = d0;
+#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
+        sr = d0; sb = d2;
+#endif
         }
         /* XXX: if we want to prefetch, hide it in the above asm()
          * using the gcc __builtin_prefetch(), the prefetch will
@@ -1169,34 +1167,34 @@
          * at the top of the loop, just after the vld4.
          */
 
-        /* sr = sr - (sr>>5) + d */
+        // sr = sr - (sr>>5) + d
         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
         dr = vaddl_u8(sr, d);
 
-        /* sb = sb - (sb>>5) + d */
+        // sb = sb - (sb>>5) + d
         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
         db = vaddl_u8(sb, d);
 
-        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
+        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
-        dg = vaddl_u8(sg, vshr_n_u8(d,1));
-        /* XXX: check that the "d>>1" here is hoisted */
+        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
 
-        /* pack high bits of each into 565 format  (rgb, b is lsb) */
+        // pack high bits of each into 565 format  (rgb, b is lsb)
         dst8 = vshrq_n_u16(db, 3);
         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
-        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
+        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
 
-        /* store it */
+        // store it
         vst1q_u16(dst, dst8);
 
 #if    defined(DEBUG_S32_OPAQUE_DITHER)
-        /* always good to know if we generated good results */
+        // always good to know if we generated good results
         {
         int i, myx = x, myy = y;
         DITHER_565_SCAN(myy);
         for (i=0;i<UNROLL;i++) {
-            SkPMColor c = src[i];
+            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
+            SkPMColor c = src[i-8];
             unsigned dither = DITHER_VALUE(myx);
             uint16_t val = SkDitherRGB32To565(c, dither);
             if (val != dst[i]) {
@@ -1209,14 +1207,14 @@
 #endif
 
         dst += UNROLL;
-        src += UNROLL;
+        // we don't need to increment src as the asm above has already done it
         count -= UNROLL;
-        x += UNROLL;        /* probably superfluous */
+        x += UNROLL;        // probably superfluous
     }
     }
 #undef    UNROLL
 
-    /* residuals */
+    // residuals
     if (count > 0) {
         DITHER_565_SCAN(y);
         do {