ARM Skia NEON patches - 31 - Xfermode: xfer16
Xfermode: xfer16

This adds support for 16bit Xfermodes. It also tunes the gcc test
macros in xfer32() to add compatibility for gcc > 4.

Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=
R=djsollen@google.com, mtklein@google.com, reed@google.com

Author: kevin.petit.arm@gmail.com

Review URL: https://codereview.chromium.org/33063002

git-svn-id: http://skia.googlecode.com/svn/trunk/src@12192 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/opts/SkBlitRow_opts_arm_neon.cpp b/opts/SkBlitRow_opts_arm_neon.cpp
index d9d40a1..672980d 100644
--- a/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/opts/SkBlitRow_opts_arm_neon.cpp
@@ -31,9 +31,7 @@
         vsrc = vld4_u8((uint8_t*)src);
 
         // Convert src to 565
-        vdst = vshll_n_u8(vsrc.val[NEON_R], 8);
-        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5);
-        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6);
+        vdst = SkPixel32ToPixel16_neon8(vsrc);
 
         // Store
         vst1q_u16(dst, vdst);
diff --git a/opts/SkColor_opts_neon.h b/opts/SkColor_opts_neon.h
index cd9e813..f812397 100644
--- a/opts/SkColor_opts_neon.h
+++ b/opts/SkColor_opts_neon.h
@@ -29,4 +29,40 @@
     return ret;
 }
 
+/* This function expands 8 pixels from RGB565 (R, G, B from high to low) to
+ * SkPMColor (all possible configurations supported) in the exact same way as
+ * SkPixel16ToPixel32.
+ */
+static inline uint8x8x4_t SkPixel16ToPixel32_neon8(uint16x8_t vsrc) {
+
+    uint8x8x4_t ret;
+    uint8x8_t vr, vg, vb;
+
+    vr = vmovn_u16(vshrq_n_u16(vsrc, SK_R16_SHIFT));
+    vg = vmovn_u16(vshrq_n_u16(vshlq_n_u16(vsrc, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS));
+    vb = vmovn_u16(vsrc & vdupq_n_u16(SK_B16_MASK));
+
+    ret.val[NEON_A] = vdup_n_u8(0xFF);
+    ret.val[NEON_R] = vshl_n_u8(vr, 8 - SK_R16_BITS) | vshr_n_u8(vr, 2 * SK_R16_BITS - 8);
+    ret.val[NEON_G] = vshl_n_u8(vg, 8 - SK_G16_BITS) | vshr_n_u8(vg, 2 * SK_G16_BITS - 8);
+    ret.val[NEON_B] = vshl_n_u8(vb, 8 - SK_B16_BITS) | vshr_n_u8(vb, 2 * SK_B16_BITS - 8);
+
+    return ret;
+}
+
+/* This function packs 8 pixels from SkPMColor (all possible configurations
+ * supported) to RGB565 (R, G, B from high to low) in the exact same way as
+ * SkPixel32ToPixel16.
+ */
+static inline uint16x8_t SkPixel32ToPixel16_neon8(uint8x8x4_t vsrc) {
+
+    uint16x8_t ret;
+
+    ret = vshll_n_u8(vsrc.val[NEON_R], 8);
+    ret = vsriq_n_u16(ret, vshll_n_u8(vsrc.val[NEON_G], 8), SK_R16_BITS);
+    ret = vsriq_n_u16(ret, vshll_n_u8(vsrc.val[NEON_B], 8), SK_R16_BITS + SK_G16_BITS);
+
+    return ret;
+}
+
 #endif /* #ifndef SkColor_opts_neon_DEFINED */
diff --git a/opts/SkXfermode_opts_arm_neon.cpp b/opts/SkXfermode_opts_arm_neon.cpp
index f4ff18c..b8d8ef5 100644
--- a/opts/SkXfermode_opts_arm_neon.cpp
+++ b/opts/SkXfermode_opts_arm_neon.cpp
@@ -574,13 +574,14 @@
 
     SkXfermodeProc proc = this->getProc();
     SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
+    SkASSERT(procSIMD != NULL);
 
     if (NULL == aa) {
         // Unrolled NEON code
         while (count >= 8) {
             uint8x8x4_t vsrc, vdst, vres;
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ > 6)
+#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
             asm volatile (
                 "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
                 "vld4.u8    %h[vdst], [%[dst]]   \t\n"
@@ -639,6 +640,74 @@
     }
 }
 
+void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
+                                     const SkPMColor* SK_RESTRICT src, int count,
+                                     const SkAlpha* SK_RESTRICT aa) const {
+    SkASSERT(dst && src && count >= 0);
+
+    SkXfermodeProc proc = this->getProc();
+    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
+    SkASSERT(procSIMD != NULL);
+
+    if (NULL == aa) {
+        while(count >= 8) {
+            uint16x8_t vdst, vres16;
+            uint8x8x4_t vdst32, vsrc, vres;
+
+            vdst = vld1q_u16(dst);
+
+#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
+            asm volatile (
+                "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
+                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
+                : :
+            );
+#else
+            register uint8x8_t d0 asm("d0");
+            register uint8x8_t d1 asm("d1");
+            register uint8x8_t d2 asm("d2");
+            register uint8x8_t d3 asm("d3");
+
+            asm volatile (
+                "vld4.u8    {d0-d3},[%[src]]!;"
+                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
+                  [src] "+&r" (src)
+                : :
+            );
+            vsrc.val[0] = d0;
+            vsrc.val[1] = d1;
+            vsrc.val[2] = d2;
+            vsrc.val[3] = d3;
+#endif
+
+            vdst32 = SkPixel16ToPixel32_neon8(vdst);
+            vres = procSIMD(vsrc, vdst32);
+            vres16 = SkPixel32ToPixel16_neon8(vres);
+
+            vst1q_u16(dst, vres16);
+
+            count -= 8;
+            dst += 8;
+        }
+        for (int i = 0; i < count; i++) {
+            SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
+            dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
+        }
+    } else {
+        for (int i = count - 1; i >= 0; --i) {
+            unsigned a = aa[i];
+            if (0 != a) {
+                SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
+                SkPMColor C = proc(src[i], dstC);
+                if (0xFF != a) {
+                    C = SkFourByteInterp(C, dstC, a);
+                }
+                dst[i] = SkPixel32ToPixel16_ToU16(C);
+            }
+        }
+    }
+}
+
 #ifdef SK_DEVELOPER
 void SkNEONProcCoeffXfermode::toString(SkString* str) const {
     this->INHERITED::toString(str);
diff --git a/opts/SkXfermode_opts_arm_neon.h b/opts/SkXfermode_opts_arm_neon.h
index 702b216..4c88fc7 100644
--- a/opts/SkXfermode_opts_arm_neon.h
+++ b/opts/SkXfermode_opts_arm_neon.h
@@ -11,6 +11,8 @@
 
     virtual void xfer32(SkPMColor dst[], const SkPMColor src[], int count,
                         const SkAlpha aa[]) const SK_OVERRIDE;
+    virtual void xfer16(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,
+                        int count, const SkAlpha* SK_RESTRICT aa) const SK_OVERRIDE;
 
     SK_DEVELOPER_TO_STRING()
     SK_DECLARE_PUBLIC_FLATTENABLE_DESERIALIZATION_PROCS(SkNEONProcCoeffXfermode)