NEON speed up
Add a TransformDC special case, and make DoTransform (the switch-based
dispatch function) inline.
Recovers some of the CPU time lost when TransformAC3 was added (ARM only).
Change-Id: I21c1f0c6a9cb9d1dfc1e307b4f473a2791273bd6
diff --git a/src/dec/frame.c b/src/dec/frame.c
index 5a4f814..bf46195 100644
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -671,7 +671,7 @@
memcpy(dst, src, 4);
}
-static void DoTransform(uint32_t bits, const int16_t* const src,
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
uint8_t* const dst) {
switch (bits >> 30) {
case 3:
diff --git a/src/dsp/dec_neon.c b/src/dsp/dec_neon.c
index f9e27d5..8f19b84 100644
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -160,7 +160,7 @@
//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)
-static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t constants[] = {20091, 17734, 0, 0};
/* kC1, kC2. Padded because vld1.16 loads 8 bytes
@@ -309,13 +309,44 @@
);
}
-static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
- TransformOneNEON(in, dst);
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {  // runs TransformOne once, or twice when do_two is non-zero
+ TransformOne(in, dst);  // the first block is always transformed
if (do_two) {
- TransformOneNEON(in + 16, dst + 4);
+ TransformOne(in + 16, dst + 4);  // second block: 16 coefficients further in 'in', 4 bytes further in 'dst'
}
}
+static void TransformDC(const int16_t* in, uint8_t* dst) {  // DC-only case: adds one value derived from in[0] to four rows of four bytes at dst
+ const int DC = (in[0] + 4) >> 3;  // only coefficient 0 is read; +4 is the rounding bias for the >> 3
+ const int kBPS = BPS;  // byte step between consecutive rows of dst (BPS is defined outside this hunk)
+ __asm__ volatile (
+ "vdup.16 q1, %[DC] \n"  // broadcast DC into every 16-bit lane of q1
+
+ "vld1.32 d0[0], [%[dst]], %[kBPS] \n"  // row 0 -> d0[0]; dst += kBPS
+ "vld1.32 d1[0], [%[dst]], %[kBPS] \n"  // row 1 -> d1[0]; dst += kBPS
+ "vld1.32 d0[1], [%[dst]], %[kBPS] \n"  // row 2 -> d0[1]; dst += kBPS
+ "vld1.32 d1[1], [%[dst]], %[kBPS] \n"  // row 3 -> d1[1]; dst += kBPS
+
+ "sub %[dst], %[dst], %[kBPS], lsl #2 \n"  // rewind dst by 4 * kBPS, back to row 0
+
+ // add DC and convert to s16.
+ "vaddw.u8 q2, q1, d0 \n"  // q2 = DC + rows 0 and 2, widened to 16 bits
+ "vaddw.u8 q3, q1, d1 \n"  // q3 = DC + rows 1 and 3, widened to 16 bits
+ // convert back to u8 with saturation
+ "vqmovun.s16 d0, q2 \n"
+ "vqmovun.s16 d1, q3 \n"
+
+ "vst1.32 d0[0], [%[dst]], %[kBPS] \n"  // stores use the same lane/row order as the loads above
+ "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
+ "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
+ "vst1.32 d1[1], [%[dst]] \n"  // last row: no post-increment
+ : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */  // NOTE(review): the template never references %[in]; this operand looks droppable - confirm
+ : [kBPS] "r"(kBPS), /* constants */
+ [DC] "r"(DC)
+ : "memory", "q0", "q1", "q2", "q3" /* clobbered */  // "memory" because dst is read and written through the pointer
+ );
+}
+
static void TransformWHT(const int16_t* in, int16_t* out) {
const int kStep = 32; // The store is only incrementing the pointer as if we
// had stored a single byte.
@@ -392,7 +423,9 @@
void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
- VP8Transform = TransformTwoNEON;
+ VP8Transform = TransformTwo;
+ VP8TransformAC3 = TransformOne; // no special code here
+ VP8TransformDC = TransformDC;
VP8TransformWHT = TransformWHT;
VP8SimpleVFilter16 = SimpleVFilter16NEON;