ARM Skia NEON patches - 21 - new NEON S32_D565_Opaque




BlitRow565: NEON version of S32_D565_Opaque

Here's a new NEON implementation of S32_D565_Opaque. It dramatically
improves speed compared to S32A_D565_Opaque, which was used in its place.

Here are the benchmark results (speedup vs. existing NEON):

+-------+-----------+------------+
| count | Cortex-A9 | Cortex-A15 |
+-------+-----------+------------+
| 1     | +130%     | +139%      |
+-------+-----------+------------+
| 2     | +65,2%    | +51%       |
+-------+-----------+------------+
| 4     | -25,5%    | +10,2%     |
+-------+-----------+------------+
| 8     | +63,8%    | +32,1%     |
+-------+-----------+------------+
| 16    | +110%     | +49,2%     |
+-------+-----------+------------+
| 64    | +153%     | +123,5%    |
+-------+-----------+------------+
| 256   | +151%     | +144,7%    |
+-------+-----------+------------+
| 1024  | +272%     | +157,2%    |
+-------+-----------+------------+

Signed-off-by: Kévin PETIT <kevin.petit@arm.com>

BUG=
R=djsollen@google.com, mtklein@google.com

Author: kevin.petit.arm@gmail.com

Review URL: https://chromiumcodereview.appspot.com/22351006

git-svn-id: http://skia.googlecode.com/svn/trunk/src@11415 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/opts/SkBlitRow_opts_arm_neon.cpp b/opts/SkBlitRow_opts_arm_neon.cpp
index 705ee99..ffa0a8b 100644
--- a/opts/SkBlitRow_opts_arm_neon.cpp
+++ b/opts/SkBlitRow_opts_arm_neon.cpp
@@ -15,9 +15,45 @@
 #include "SkUtils.h"
 
 #include "SkCachePreload_arm.h"
-
+#include "SkColor_opts_neon.h"
 #include <arm_neon.h>
 
+void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
+                           const SkPMColor* SK_RESTRICT src, int count,
+                           U8CPU alpha, int /*x*/, int /*y*/) {
+    SkASSERT(255 == alpha);  // alpha is only checked here; it is never applied to the pixels
+    // Writes one 16-bit pixel to dst for each of the count 32-bit pixels read from src.
+    while (count >= 8) {  // vector path: 8 pixels per iteration
+        uint8x8x4_t vsrc;  // four 8-byte vectors filled by the vld4_u8 load below
+        uint16x8_t vdst;   // eight 16-bit results
+
+        // Load 8 pixels (32 bytes), de-interleaved so val[i] holds byte i of each pixel
+        vsrc = vld4_u8((uint8_t*)src);
+
+        // Convert src to 565; the fourth val[] entry is loaded but not used
+        vdst = vshll_n_u8(vsrc.val[NEON_R], 8);  // widen to 16 bits with R in bits 15..8
+        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5);  // keep top 5 bits (R), insert G below them
+        vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6);  // keep top 11 bits (R, G), insert B in the low 5
+
+        // Store the 8 converted pixels
+        vst1q_u16(dst, vdst);
+
+        // Prepare next iteration: advance both pointers by the 8 pixels just handled
+        dst += 8;
+        src += 8;
+        count -= 8;
+    };
+
+    // Leftovers: fewer than 8 pixels remain, so convert them one at a time
+    while (count > 0) {
+        SkPMColor c = *src++;
+        SkPMColorAssert(c);
+        *dst = SkPixel32ToPixel16_ToU16(c);
+        dst++;
+        count--;
+    };
+}
+
 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                            const SkPMColor* SK_RESTRICT src, int count,
                            U8CPU alpha, int /*x*/, int /*y*/) {
@@ -1330,10 +1366,10 @@
 
 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
     // no dither
-    // NOTE: For the two functions below, we don't have a special version
-    //       that assumes that each source pixel is opaque. But our S32A is
-    //       still faster than the default, so use it.
-    S32A_D565_Opaque_neon,  // really S32_D565_Opaque
+    // NOTE: For the S32_D565_Blend function below, we don't have a special
+    //       version that assumes that each source pixel is opaque. But our
+    //       S32A is still faster than the default, so use it.
+    S32_D565_Opaque_neon,
     S32A_D565_Blend_neon,   // really S32_D565_Blend
     S32A_D565_Opaque_neon,
     S32A_D565_Blend_neon,
diff --git a/opts/SkColor_opts_neon.h b/opts/SkColor_opts_neon.h
new file mode 100644
index 0000000..adc2641
--- /dev/null
+++ b/opts/SkColor_opts_neon.h
@@ -0,0 +1,12 @@
+#ifndef SkColor_opts_neon_DEFINED
+#define SkColor_opts_neon_DEFINED
+
+#include "SkTypes.h"
+
+#define NEON_A (SK_A32_SHIFT / 8)  // alpha: 32-bit channel shift / 8 (presumably the channel's byte position)
+#define NEON_R (SK_R32_SHIFT / 8)  // red: used to index the vectors returned by vld4_u8
+#define NEON_G (SK_G32_SHIFT / 8)  // green: same derivation and use as NEON_R
+#define NEON_B (SK_B32_SHIFT / 8)  // blue: same derivation and use as NEON_R
+
+#endif /* #ifndef SkColor_opts_neon_DEFINED */
+