 /* cpu_ref/rsCpuIntrinsics_advsimd_Resize.S - platform/frameworks/rs - Git at Google */

 /*
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* ENTRY(f) switches to .text, applies .align 4, exports f as a global
  * function symbol and opens its label; END(f) records the symbol's size.
  * Every routine below is bracketed by this pair.
  */
 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
 #define END(f) .size f, .-f;

 /* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
  * integer (bicubic has a little overshoot).  It would also be possible to add
  * a temporary DC bias to eliminate the sign bit for more precision, but that's
  * extra arithmetic.
  */
 .set VERTBITS, 14

 /* The size, in pixels, of one chunk of the scratch buffer in which we store
  * our vertically convolved intermediates.  The buffer itself holds two such
  * chunks plus four extra pixels of 16-bit components (see BUFFER_SIZE).
  */
 .set CHUNKSHIFT, 7       /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
 .set CHUNKSIZE, (1 << CHUNKSHIFT)

 /* The number of output pixels produced by a single iteration of the innermost
  * loop; each iteration stores LOOP_OUTPUT_SIZE = VECSIZE * COMPONENT_COUNT
  * bytes.
  */
 .set VECSHIFT, 3
 .set VECSIZE, (1<<VECSHIFT)

 /* Read four different lines (except at edges where addresses may be clamped,
  * which is why we don't simply take base and stride registers), and multiply
  * and accumulate them by the coefficients in v3[0..3], leaving the results in
  * v12.  This gives eight 16-bit results representing a horizontal line of 2-8
  * input pixels (depending on number of components per pixel) to be fed into
  * the horizontal scaling pass.
  *
  * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
  * known to represent negative values and UMLSL/UMLSL2 is used to implement
  * this).  Output is VERTBITS signed fixed-point, which must leave room for a
  * little overshoot (see the note on VERTBITS).
  *
  * In:       x4-x7 = pointers into the four source lines; each is
  *           post-incremented by 8 bytes.
  *           v3.h[0..3] = vertical coefficients.
  * Out:      \dstlo / \dsthi (v12 by default) = eight signed 16-bit results.
  * Clobbers: v8-v13 (v12/v13 are always used as 32-bit accumulators, even
  *           when a different destination register is named).
  */
 .macro vert8, dstlo=v12.4h, dsthi=v12.8h
         ld1         {v8.8b}, [x4], #8
         ld1         {v9.8b}, [x5], #8
         ld1         {v10.8b}, [x6], #8
         ld1         {v11.8b}, [x7], #8
         /* widen the 8-bit samples to 16 bits for the widening multiplies */
         uxtl        v8.8h, v8.8b
         uxtl        v9.8h, v9.8b
         uxtl        v10.8h, v10.8b
         uxtl        v11.8h, v11.8b
         /* acc = line1*c[1] - line0*c[0] + line2*c[2] - line3*c[3] */
         umull       v12.4s, v9.4h, v3.h[1]
         umull2      v13.4s, v9.8h, v3.h[1]
         umlsl       v12.4s, v8.4h, v3.h[0]
         umlsl2      v13.4s, v8.8h, v3.h[0]
         umlal       v12.4s, v10.4h, v3.h[2]
         umlal2      v13.4s, v10.8h, v3.h[2]
         umlsl       v12.4s, v11.4h, v3.h[3]
         umlsl2      v13.4s, v11.8h, v3.h[3]

         /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
          * minus VERTBITS (the number of fraction bits we want to keep from
          * here on).  The narrowing saturates as signed 16-bit.
          */
         sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
         sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
 .endm

 /* As above, but only four 16-bit results into v12hi.
  *
  * Four bytes are loaded from each of x4-x7 (each post-incremented by 4) into
  * lane s[0] of v8-v11, and only the low four widened lanes are multiplied.
  * With the default destination (v12.8h) the narrowed results are written to
  * the upper half of v12; the lower half is left holding part of the 32-bit
  * accumulator.  Any other destination receives the results in its lower half.
  *
  * Clobbers: v8-v12.
  */
 .macro vert4, dst=v12.8h
         ld1         {v8.s}[0], [x4], #4
         ld1         {v9.s}[0], [x5], #4
         ld1         {v10.s}[0], [x6], #4
         ld1         {v11.s}[0], [x7], #4
         uxtl        v8.8h, v8.8b
         uxtl        v9.8h, v9.8b
         uxtl        v10.8h, v10.8b
         uxtl        v11.8h, v11.8b
         /* acc = line1*c[1] - line0*c[0] + line2*c[2] - line3*c[3] */
         umull       v12.4s, v9.4h, v3.h[1]
         umlsl       v12.4s, v8.4h, v3.h[0]
         umlal       v12.4s, v10.4h, v3.h[2]
         umlsl       v12.4s, v11.4h, v3.h[3]
         /* same scaling as vert8; pick the upper- or lower-half narrowing form
          * to suit the destination operand that was passed in.
          */
 .ifc \dst,v12.8h
         sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
 .else
         sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
 .endif
 .endm


 /* During horizontal resize having CHUNKSIZE input available means being able
  * to produce a varying amount of output, depending on the phase of the data.
  * This function calculates the minimum number of VECSIZE chunks extracted from
  * a CHUNKSIZE window (x1), and the threshold value for when the count will be
  * one higher than that (x0).
  * These work out, conveniently, to be the quotient and remainder from:
  *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
  *
  * The two values are packed together in a uint64_t for convenience; and
  * they are, in fact, used this way as an arithmetic short-cut later on.
  */
 /* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc)
  *
  * In:       w0 = xinc, fixed-point with 16 fraction bits (CHUNKSIZE is scaled
  *           by 1 << 16 below to match).
  * Out:      x0 = remainder in the low 32 bits, quotient in the high 32 bits,
  *           of ((CHUNKSIZE << 16) + xinc * VECSIZE - 1) / (xinc * VECSIZE).
  * Clobbers: x1, x2.
  *
  * xinc is only a 32-bit argument, so the upper half of x0 is not guaranteed
  * to be zero on entry; it is zero-extended as part of the initial shift so
  * that stray high bits cannot leak into the 64-bit divide.
  */
 ENTRY(rsdIntrinsicResize_oscctl_K)
         ubfiz       x2, x0, #VECSHIFT, #32      /* x2 = (uint64_t)xinc << VECSHIFT */
         mov         x0, #(CHUNKSIZE << 16) - 1
         add         x0, x0, x2                  /* numerator */
         udiv        x1, x0, x2                  /* x1 = quotient */
         msub        x0, x1, x2, x0              /* x0 = remainder */
         add         x0, x0, x1, LSL #32         /* pack quotient:remainder */
         ret
 END(rsdIntrinsicResize_oscctl_K)

 /* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
  * For the most part the vertical pass (the outer loop) is the same for all
  * versions.  Exceptions are handled in-line with conditional assembly.
  */
 .irp comp, 1, 2, 4
 .if \comp == 1
 .set COMPONENT_SHIFT, 0
 .elseif \comp == 2
 .set COMPONENT_SHIFT, 1
 .elseif \comp == 4
 .set COMPONENT_SHIFT, 2
 .else
 .error "Unknown component count"
 .endif
 .set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
 .set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

 .set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2

 /* void rsdIntrinsicResizeB1_K(
  *             uint8_t * restrict dst,          // x0
  *             size_t count,                    // x1
  *             uint32_t xf,                     // x2
  *             uint32_t xinc,                   // x3
  *             uint8_t const * restrict srcn,   // x4
  *             uint8_t const * restrict src0,   // x5
  *             uint8_t const * restrict src1,   // x6
  *             uint8_t const * restrict src2,   // x7
  *             size_t xclip,                    // [sp,#0]  -> [sp,#80] -> x13
  *             size_t avail,                    // [sp,#8]  -> [sp,#88] -> x11
  *             uint64_t osc_ctl,                // [sp,#16] -> [sp,#96] -> x10
  *             int32 const *yr,                 // [sp,#24] -> [sp,#104] -> v4   (copied to v3   for scalar access)
  *
  * Register roles after the prologue:
  *   x0       output pointer
  *   x1       output pixels still to be produced
  *   x2, x3   x position and increment, shifted into the top bits so that
  *            overflow implements wraparound indexing of the scratch buffer
  *   x4-x7    source line pointers
  *   x8, x9   oscillator state and its per-iteration step (xinc * VECSIZE)
  *   x10      osc_ctl
  *   x11      input pixels still available
  *   x12      fill pointer into the scratch buffer
  *   x13      xclip during set-up; afterwards a temporary and the inner-loop
  *            output byte counter
  *   x14-x17  tap pointers into the scratch buffer
  *   x19      caller's (post-save) sp, restored on exit
  *   v4       y coefficients; v6 / v7 vector x fraction and its increment
  *   v30, v31 constants 3 and 0x8000 for the x coefficient calculation
  */
 ENTRY(rsdIntrinsicResizeB\comp\()_K)
             /* Save the callee-saved halves of v8-v15 and x19 in an 80-byte
              * frame; the stack arguments are then at [sp,#80] onwards.
              */
             sub         x8, sp, #48
             sub         sp, sp, #80
             st1         {v8.1d - v11.1d}, [sp]
             st1         {v12.1d - v15.1d}, [x8]
             str         x19, [x8, #32]

             /* align the working buffer on the stack to make it easy to use bit
              * twiddling for address calculations.
              */
             sub         x12, sp, #BUFFER_SIZE
             bic         x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1

             ldr         x8, [sp,#104]           // yr
             adr         x9, 8f
             ld1         {v4.4s}, [x8]
             ld1         {v5.8h}, [x9]
             sqxtun      v4.4h, v4.4s            // yr
             dup         v6.8h, w2
             dup         v7.8h, w3
             mla         v6.8h, v5.8h, v7.8h     // vxf
             shl         v7.8h, v7.8h, #VECSHIFT // vxinc

             /* xf and xinc are 32-bit arguments, so the upper halves of x2 and
              * x3 are not guaranteed to be zero on entry.  Clear them before
              * the 64-bit oscillator arithmetic below depends on them.
              */
             mov         w2, w2
             mov         w3, w3

             /* Compute starting condition for oscillator used to compute ahead
              * of time how many iterations are possible before needing to
              * refill the working buffer.  This is based on the fixed-point
              * index of the last element in the vector of pixels processed in
              * each iteration, counting up until it would overflow.
              */
             sub         x8, x2, x3
             lsl         x9, x3, #VECSHIFT
             add         x8, x8, x9

             ldr         x10, [sp,#96]           // osc_ctl
             ldp         x13,x11, [sp,#80]       // xclip, avail

             /* keep the frame address in x19 and move sp down to the aligned
              * scratch buffer.
              */
             mov         x19, sp
             mov         sp, x12

             /* x4-x7 contain pointers to the four lines of input to be
              * convolved.  These pointers have been clamped vertically and
              * horizontally (which is why it's not a simple row/stride pair),
              * and the xclip argument (now in x13) indicates how many pixels
              * from true the x position of the pointer is.  This value should
              * be 0, 1, or 2 only.
              *
              * Start by placing four pixels worth of input at the far end of
              * the buffer.  As many as two of these may be clipped, so four
              * pixels are fetched, and then the first pixel is duplicated and
              * the data shifted according to xclip.  The source pointers are
              * then also adjusted according to xclip so that subsequent fetches
              * match.
              */
             mov         v3.8b, v4.8b  /* make y coeffs available for vert4 and vert8 macros */
             sub         x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
             add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
             add         x14, x14, #4 * COMPONENT_COUNT * 2
 .if \comp == 1
             vert4       v12.4h
             dup         v11.4h, v12.h[0]
             st1         {v11.4h,v12.4h}, [x12]
             ld1         {v12.4h}, [x14]
             st1         {v12.4h}, [x15]
 .elseif \comp == 2
             vert8
             dup         v11.4s, v12.s[0]
             st1         {v11.8h,v12.8h}, [x12]
             ld1         {v12.8h}, [x14]
             st1         {v12.8h}, [x15]
 .elseif \comp == 4
             vert8       v14.4h, v14.8h
             vert8       v15.4h, v15.8h
             dup         v12.2d, v14.d[0]
             dup         v13.2d, v14.d[0]
             st1         {v12.8h,v13.8h}, [x12], #32
             st1         {v14.8h,v15.8h}, [x12]
             sub         x12, x12, #32
             ld1         {v11.8h,v12.8h}, [x14]
             st1         {v11.8h,v12.8h}, [x15]
 .endif
             /* Count off four pixels into the working buffer.
              */
             sub         x11, x11, #4
             /* Incoming pointers were to the first _legal_ pixel.  Four pixels
              * were read unconditionally, but some may have been discarded by
              * xclip, so we rewind the pointers to compensate.
              */
             sub         x4, x4, x13, LSL #(COMPONENT_SHIFT)
             sub         x5, x5, x13, LSL #(COMPONENT_SHIFT)
             sub         x6, x6, x13, LSL #(COMPONENT_SHIFT)
             sub         x7, x7, x13, LSL #(COMPONENT_SHIFT)

             /* First tap starts where we just pre-filled, at the end of the
              * buffer.
              */
             add         x2, x2, #(CHUNKSIZE * 2 - 4) << 16

             /* Use overflowing arithmetic to implement wraparound array
              * indexing.
              */
             lsl         x2, x2, #(47 - CHUNKSHIFT)
             lsl         x3, x3, #(47 - CHUNKSHIFT)


             /* Start of outermost loop.
              * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
              * number of iterations of the inner loop that can be performed and
              * get into that.
              *
              * The fill is complicated by the possibility of running out of
              * input before the scratch buffer is filled.  If this isn't a risk
              * then it's handled by the simple loop at 2:, otherwise the
              * horrible loop at 3:.
              */
 1:          mov         v3.8b, v4.8b            /* put y scaling coefficients somewhere handy */
             subs        x11, x11, #CHUNKSIZE
             bge         2f                      /* if at least CHUNKSIZE are available... */
             add         x11, x11, #CHUNKSIZE    /* if they're not... */
             b           4f
             /* ..just sneaking a literal in here after this unconditional branch.. */
 8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
             /* basic fill loop, processing 8 bytes at a time until there are
              * fewer than eight bytes available.
              */
 3:          vert8
             sub         x11, x11, #8 / COMPONENT_COUNT
             st1         {v12.8h}, [x12], #16
 4:          cmp         x11, #8 / COMPONENT_COUNT - 1
             bgt         3b
 .if \comp == 4
             blt         3f
             /* The last pixel (four bytes) if necessary */
             vert4
 .else
             cmp         x11, #1
             blt         3f
             /* The last pixels if necessary.  Step the pointers so that the
              * final eight-byte read ends exactly at the last available pixel,
              * then use a temporary below sp to shift the valid pixels to the
              * front of v12 with the last pixel replicated behind them.
              */
             sub         x4, x4, #8
             sub         x5, x5, #8
             sub         x6, x6, #8
             sub         x7, x7, #8
             add         x4, x4, x11, LSL #(COMPONENT_SHIFT)
             add         x5, x5, x11, LSL #(COMPONENT_SHIFT)
             add         x6, x6, x11, LSL #(COMPONENT_SHIFT)
             add         x7, x7, x11, LSL #(COMPONENT_SHIFT)
             vert8
             sub         x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
             sub         sp, sp, #32
             sub         x11, x11, #16
 .if \comp == 1
             dup         v13.8h, v12.h[7]
 .elseif \comp == 2
             dup         v13.4s, v12.s[3]
 .endif
             st1         {v12.8h,v13.8h}, [sp]
             ld1         {v12.8h}, [x11]
             add         sp, sp, #32
             b           4f
 .endif
             /* Keep filling until we get to the end of this chunk of the buffer */
 3:
 .if \comp == 1
             dup         v12.8h, v12.h[7]
 .elseif \comp == 2
             dup         v12.4s, v12.s[3]
 .elseif \comp == 4
             dup         v12.2d, v12.d[1]
 .endif
 4:          st1         {v12.8h}, [x12], #16
             tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
             bne         3b
             b           4f

 .align 4
 2:          /* Quickly pull a chunk of data into the working buffer.
              */
             vert8
             st1         {v12.8h}, [x12], #16
             vert8
             st1         {v12.8h}, [x12], #16
             tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
             bne         2b
             cmp         x11, #0
             bne         3f
 4:          /* if we end with 0 pixels left we'll have nothing handy to spread
              * across to the right, so we rewind a bit.
              */
             mov         x11, #1
             sub         x4, x4, #COMPONENT_COUNT
             sub         x5, x5, #COMPONENT_COUNT
             sub         x6, x6, #COMPONENT_COUNT
             sub         x7, x7, #COMPONENT_COUNT
 3:          /* copy four taps (width of cubic window) to far end for overflow
              * address handling
              */
             sub         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
             eor         x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
 .if \comp == 1
             ld1         {v14.4h}, [x13]
 .elseif \comp == 2
             ld1         {v14.8h}, [x13]
 .elseif \comp == 4
             ld1         {v14.8h,v15.8h}, [x13]
 .endif
             add         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
 .if \comp == 1
             st1         {v14.4h}, [x13]
 .elseif \comp == 2
             st1         {v14.8h}, [x13]
 .elseif \comp == 4
             st1         {v14.8h,v15.8h}, [x13]
 .endif
             /* The high 32-bits of x10 contains the maximum possible iteration
              * count, but if x8 is greater than the low 32-bits of x10 then
              * this indicates that the count must be reduced by one for this
              * iteration to avoid reading past the end of the available data.
              */
             sub         x13, x10, x8
             lsr         x13, x13, #32

             madd        x8, x13, x9, x8
             sub         x8, x8, #(CHUNKSIZE << 16)

             /* prefer to count pixels, rather than vectors, to clarify the tail
              * store case on exit.
              */
             lsl         x13, x13, #VECSHIFT
             cmp         x13, x1
             csel        x13, x1, x13, gt

             sub         x1, x1, x13

             lsl         x13, x13, #COMPONENT_SHIFT

             mov         w14, #0x8000
             movi        v30.8h, #3
             dup         v31.8h, w14

             cmp         x13, #0
             bgt         3f
             cmp         x1, #0
             bgt         1b     /* an extreme case where we shouldn't use code in this structure */
             b           9f

             .align 4
 2:          /* Inner loop continues here, but starts at 3:, see end of loop
              * below for explanation. */
 .if LOOP_OUTPUT_SIZE == 4
             st1         {v8.s}[0], [x0], #4
 .elseif LOOP_OUTPUT_SIZE == 8
             st1         {v8.8b}, [x0], #8
 .elseif LOOP_OUTPUT_SIZE == 16
             st1         {v8.16b}, [x0], #16
 .elseif LOOP_OUTPUT_SIZE == 32
             st1         {v8.16b,v9.16b}, [x0], #32
 .endif
             /* Inner loop:  here the four x coefficients for each tap are
              * calculated in vector code, and the addresses are calculated in
              * scalar code, and these calculations are interleaved.
              */
 3:          ushr        v8.8h, v6.8h, #1            // sxf
             lsr         x14, x2, #(63 - CHUNKSHIFT)
             sqrdmulh    v9.8h, v8.8h, v8.8h         // sxf**2
             add         x2, x2, x3
             sqrdmulh    v10.8h, v9.8h, v8.8h        // sxf**3
             lsr         x15, x2, #(63 - CHUNKSHIFT)
             sshll       v11.4s, v9.4h, #2
             sshll2      v12.4s, v9.8h, #2
             add         x2, x2, x3
             smlsl       v11.4s, v10.4h, v30.4h
             smlsl2      v12.4s, v10.8h, v30.8h
             lsr         x16, x2, #(63 - CHUNKSHIFT)

             shadd       v0.8h, v10.8h, v8.8h
             add         x2, x2, x3
             sub         v0.8h, v9.8h, v0.8h
             lsr         x17, x2, #(63 - CHUNKSHIFT)

             saddw       v1.4s, v11.4s, v9.4h
             saddw2      v13.4s, v12.4s, v9.8h
             add         x2, x2, x3
             shrn        v1.4h, v1.4s, #1
             shrn2       v1.8h, v13.4s, #1
             add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
             sub         v1.8h, v1.8h, v31.8h
             add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)

             saddw       v2.4s, v11.4s, v8.4h
             saddw2      v13.4s, v12.4s, v8.8h
             add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
             shrn        v2.4h, v2.4s, #1
             shrn2       v2.8h, v13.4s, #1
             add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
             neg         v2.8h, v2.8h

             shsub       v3.8h, v10.8h, v9.8h

             /* increment the x fractional parts (overflow is ignored, as the
              * scalar arithmetic shadows this addition with full precision).
              */
             add         v6.8h, v6.8h, v7.8h

             /* At this point we have four pointers in x14-x17, pointing to the
              * four taps in the scratch buffer that must be convolved together
              * to produce an output pixel (one output pixel per pointer).
              * These pointers usually overlap, but their spacing is irregular
              * so resolving the redundancy through L1 is a pragmatic solution.
              *
              * The scratch buffer is made of signed 16-bit data, holding over
              * some extra precision, and overshoot, from the vertical pass.
              *
              * We also have the 16-bit unsigned fixed-point weights for each
              * of the four taps in v0 - v3.  That's eight pixels worth of
              * coefficients when we have only four pointers, so calculations
              * for four more pixels are interleaved with the fetch and permute
              * code for each variant in the following code.
              *
              * The data arrangement is less than ideal for any pixel format,
              * but permuting loads help to mitigate most of the problems.
              *
              * Note also that the two outside taps of a bicubic are negative,
              * but these coefficients are unsigned.  The sign is hard-coded by
              * use of multiply-and-subtract operations.
              */
 .if \comp == 1
             /* The uchar 1 case.
              * Issue one lanewise ld4.h to load four consecutive pixels from
              * one pointer (one pixel) into four different registers; then load
              * four consecutive s16 values from the next pointer (pixel) into
              * the next lane of those four registers, etc., so that we finish
              * with v12 - v15 representing the four taps, and each lane
              * representing a separate pixel.
              *
              * The first ld4 uses a splat to avoid any false dependency on
              * the previous state of the register.
              */
             ld4r        {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
             lsr         x14, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld4         {v12.h,v13.h,v14.h,v15.h}[1], [x15]
             add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
             lsr         x15, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld4         {v12.h,v13.h,v14.h,v15.h}[2], [x16]
             add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
             lsr         x16, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld4         {v12.h,v13.h,v14.h,v15.h}[3], [x17]
             add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
             lsr         x17, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld4         {v12.h,v13.h,v14.h,v15.h}[4], [x14]
             add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
             ld4         {v12.h,v13.h,v14.h,v15.h}[5], [x15]
             ld4         {v12.h,v13.h,v14.h,v15.h}[6], [x16]
             ld4         {v12.h,v13.h,v14.h,v15.h}[7], [x17]

             smull       v8.4s, v12.4h, v0.4h
             smull2      v9.4s, v12.8h, v0.8h
             smlsl       v8.4s, v13.4h, v1.4h
             smlsl2      v9.4s, v13.8h, v1.8h
             smlsl       v8.4s, v14.4h, v2.4h
             smlsl2      v9.4s, v14.8h, v2.8h
             smlal       v8.4s, v15.4h, v3.4h
             smlal2      v9.4s, v15.8h, v3.8h

             subs        x13, x13, #LOOP_OUTPUT_SIZE

             sqrshrn     v8.4h, v8.4s, #15
             sqrshrn2    v8.8h, v9.4s, #15

             sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
 .elseif \comp == 2
             /* The uchar2 case:
              * This time load pairs of values into adjacent lanes in v12 - v15
              * by aliasing them as u32 data; leaving room for only four pixels,
              * so the process has to be done twice.  This also means that the
              * coefficient registers fail to align with the coefficient data
              * (eight separate pixels), so that has to be doubled-up to match.
              */
             ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
             lsr         x14, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
             add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
             lsr         x15, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
             add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
             lsr         x16, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
             add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
             lsr         x17, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3

             /* double-up coefficients to align with component pairs */
             zip1        v16.8h, v0.8h, v0.8h
             add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
             zip1        v17.8h, v1.8h, v1.8h
             zip1        v18.8h, v2.8h, v2.8h
             zip1        v19.8h, v3.8h, v3.8h

             smull       v8.4s, v12.4h, v16.4h
             smull2      v9.4s, v12.8h, v16.8h
             smlsl       v8.4s, v13.4h, v17.4h
             smlsl2      v9.4s, v13.8h, v17.8h
             smlsl       v8.4s, v14.4h, v18.4h
             smlsl2      v9.4s, v14.8h, v18.8h
             smlal       v8.4s, v15.4h, v19.4h
             smlal2      v9.4s, v15.8h, v19.8h

             sqrshrn     v8.4h, v8.4s, #15
             sqrshrn2    v8.8h, v9.4s, #15

             ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
             ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
             ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
             ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]

             /* double-up coefficients to align with component pairs */
             zip2        v16.8h, v0.8h, v0.8h
             zip2        v17.8h, v1.8h, v1.8h
             zip2        v18.8h, v2.8h, v2.8h
             zip2        v19.8h, v3.8h, v3.8h

             smull       v10.4s, v12.4h, v16.4h
             smull2      v11.4s, v12.8h, v16.8h
             smlsl       v10.4s, v13.4h, v17.4h
             smlsl2      v11.4s, v13.8h, v17.8h
             smlsl       v10.4s, v14.4h, v18.4h
             smlsl2      v11.4s, v14.8h, v18.8h
             smlal       v10.4s, v15.4h, v19.4h
             smlal2      v11.4s, v15.8h, v19.8h

             subs        x13, x13, #LOOP_OUTPUT_SIZE

             sqrshrn     v9.4h, v10.4s, #15
             sqrshrn2    v9.8h, v11.4s, #15

             sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
             sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
 .elseif \comp == 4
             /* The uchar4 case.
              * This case is comparatively painless because four s16s are the
              * smallest addressable unit for a vmul-by-scalar.  Rather than
              * permute the data, simply arrange the multiplies to suit the way
              * the data comes in.  That's a lot of data, though, so things
              * progress in pairs of pixels at a time.
              */
             ld1         {v12.8h,v13.8h}, [x14]
             lsr         x14, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld1         {v14.8h,v15.8h}, [x15]
             add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
             lsr         x15, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3

             smull       v8.4s, v12.4h, v0.h[0]
             smull       v9.4s, v14.4h, v0.h[1]
             smlsl2      v8.4s, v12.8h, v1.h[0]
             smlsl2      v9.4s, v14.8h, v1.h[1]
             smlsl       v8.4s, v13.4h, v2.h[0]
             smlsl       v9.4s, v15.4h, v2.h[1]
             smlal2      v8.4s, v13.8h, v3.h[0]
             smlal2      v9.4s, v15.8h, v3.h[1]

             /* And two more...  */
             ld1         {v12.8h,v13.8h}, [x16]
             add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
             lsr         x16, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3
             ld1         {v14.8h,v15.8h}, [x17]
             add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
             lsr         x17, x2, #(63 - CHUNKSHIFT)
             add         x2, x2, x3

             sqrshrn     v8.4h, v8.4s, #15
             add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
             sqrshrn2    v8.8h, v9.4s, #15

             smull       v10.4s, v12.4h, v0.h[2]
             smull       v11.4s, v14.4h, v0.h[3]
             smlsl2      v10.4s, v12.8h, v1.h[2]
             smlsl2      v11.4s, v14.8h, v1.h[3]
             smlsl       v10.4s, v13.4h, v2.h[2]
             smlsl       v11.4s, v15.4h, v2.h[3]
             smlal2      v10.4s, v13.8h, v3.h[2]
             smlal2      v11.4s, v15.8h, v3.h[3]

             sqrshrn     v9.4h, v10.4s, #15
             sqrshrn2    v9.8h, v11.4s, #15

             sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
             sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8

             /* And two more...  */
             ld1         {v12.8h,v13.8h}, [x14]
             ld1         {v14.8h,v15.8h}, [x15]

             smull       v10.4s, v12.4h, v0.h[4]
             smull       v11.4s, v14.4h, v0.h[5]
             smlsl2      v10.4s, v12.8h, v1.h[4]
             smlsl2      v11.4s, v14.8h, v1.h[5]
             smlsl       v10.4s, v13.4h, v2.h[4]
             smlsl       v11.4s, v15.4h, v2.h[5]
             smlal2      v10.4s, v13.8h, v3.h[4]
             smlal2      v11.4s, v15.8h, v3.h[5]

             /* And two more...  */
             ld1         {v12.8h,v13.8h}, [x16]
             ld1         {v14.8h,v15.8h}, [x17]

             subs        x13, x13, #LOOP_OUTPUT_SIZE

             sqrshrn     v9.4h, v10.4s, #15
             sqrshrn2    v9.8h, v11.4s, #15

             smull       v10.4s, v12.4h, v0.h[6]
             smull       v11.4s, v14.4h, v0.h[7]
             smlsl2      v10.4s, v12.8h, v1.h[6]
             smlsl2      v11.4s, v14.8h, v1.h[7]
             smlsl       v10.4s, v13.4h, v2.h[6]
             smlsl       v11.4s, v15.4h, v2.h[7]
             smlal2      v10.4s, v13.8h, v3.h[6]
             smlal2      v11.4s, v15.8h, v3.h[7]

             sqrshrn     v10.4h, v10.4s, #15
             sqrshrn2    v10.8h, v11.4s, #15

             sqrshrun     v9.8b, v9.8h, #VERTBITS - 8
             sqrshrun2    v9.16b, v10.8h, #VERTBITS - 8
 .endif
             bgt         2b      /* continue inner loop */
             /* The inner loop has already been limited to ensure that none of
              * the earlier iterations could overfill the output, so the store
              * appears within the loop but after the conditional branch (at the
              * top).  At the end, provided it won't overfill, perform the final
              * store here.  If it would, then break out to the tricky tail case
              * instead.
              */
             blt         1f
             /* Store the amount of data appropriate to the configuration of the
              * instance being assembled.
              */
 .if LOOP_OUTPUT_SIZE == 4
             st1         {v8.s}[0], [x0], #4
 .elseif LOOP_OUTPUT_SIZE == 8
             st1         {v8.8b}, [x0], #8
 .elseif LOOP_OUTPUT_SIZE == 16
             st1         {v8.16b}, [x0], #16
 .elseif LOOP_OUTPUT_SIZE == 32
             st1         {v8.16b,v9.16b}, [x0], #32
 .endif
             b           1b              /* resume outer loop */
             /* Partial tail store case:
              * Different versions of the code need different subsets of the
              * following partial stores.  Here the number of components and the
              * size of the chunk of data produced by each inner loop iteration
              * is tested to figure out whether or not each phrase is relevant.
              */
 .if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
 1:          tst         x13, #16
             beq         1f
             st1         {v8.16b}, [x0], #16
             mov         v8.16b, v9.16b
 .endif
 .if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
 1:          tst         x13, #8
             beq         1f
             st1         {v8.8b}, [x0], #8
             ext         v8.16b, v8.16b, v8.16b, #8
 .endif
 .if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
 1:          tst         x13, #4
             beq         1f
             st1         {v8.s}[0], [x0], #4
             ext         v8.8b, v8.8b, v8.8b, #4
 .endif
 .if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
 1:          tst         x13, #2
             beq         1f
             st1         {v8.h}[0], [x0], #2
             ext         v8.8b, v8.8b, v8.8b, #2
 .endif
 .if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
 1:          tst         x13, #1
             beq         1f
             st1         {v8.b}[0], [x0], #1
 .endif
 1:
             /* Common exit: drop the scratch buffer and restore the registers
              * saved in the prologue.
              */
 9:          mov         sp, x19
             ld1         {v8.1d - v11.1d}, [sp], #32
             ld1         {v12.1d - v15.1d}, [sp], #32
             ldr         x19, [sp], #16
             ret
 END(rsdIntrinsicResizeB\comp\()_K)
 .endr