/* Copyright 2019 Google LLC. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "third_party/gemmlowp/profiling/instrumentation.h"
#include "kernel.h"
namespace ruy {
#if (defined RUY_NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
#define RUY_ASM_LABEL_STORE_UINT8 91
#define RUY_ASM_LABEL_STORE_INT8 92
#define RUY_ASM_LABEL_STORE_INT16 93
#define RUY_ASM_LABEL_STORE_INT32 94
#define RUY_ASM_LABEL_AFTER_STORE 99
#define RUY_OFFSET_LHS_BASE_PTR 0
#define RUY_OFFSET_RHS_BASE_PTR 4
#define RUY_OFFSET_DST_BASE_PTR 8
#define RUY_OFFSET_BIAS 12
#define RUY_OFFSET_START_ROW 16
#define RUY_OFFSET_START_COL 20
#define RUY_OFFSET_LAST_ROW 24
#define RUY_OFFSET_LAST_COL 28
#define RUY_OFFSET_DST_ROWS 32
#define RUY_OFFSET_DST_COLS 36
#define RUY_OFFSET_LHS_STRIDE 40
#define RUY_OFFSET_RHS_STRIDE 44
#define RUY_OFFSET_DST_STRIDE 48
#define RUY_OFFSET_DEPTH 52
#define RUY_OFFSET_CLAMP_MIN 56
#define RUY_OFFSET_CLAMP_MAX 60
#define RUY_OFFSET_FLAGS 64
#define RUY_STACK_OFFSET_SIZE 96
#define RUY_STACK_OFFSET_DST_COL_PTR 0
#define RUY_STACK_OFFSET_DST_PTR 16
#define RUY_STACK_OFFSET_ROW 32
#define RUY_STACK_OFFSET_COL 48
#define RUY_STACK_OFFSET_LHS_COL_PTR 64
#define RUY_STACK_OFFSET_RHS_COL_PTR 80
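// The kernel below reserves RUY_STACK_OFFSET_SIZE bytes of scratch space on
// the stack and spills six loop-carried values (dst_col_ptr, dst_ptr, row,
// col, lhs_col_ptr, rhs_col_ptr) to the RUY_STACK_OFFSET_* slots above,
// keeping the general-purpose registers free for the inner loop.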
template <typename Params>
void CheckOffsetsInKernelParamsFloat32(const Params&) {
static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
static_assert(offsetof(Params, rhs_base_ptr) == RUY_OFFSET_RHS_BASE_PTR, "");
static_assert(offsetof(Params, dst_base_ptr) == RUY_OFFSET_DST_BASE_PTR, "");
static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
static_assert(offsetof(Params, start_col) == RUY_OFFSET_START_COL, "");
static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
static_assert(offsetof(Params, dst_rows) == RUY_OFFSET_DST_ROWS, "");
static_assert(offsetof(Params, dst_cols) == RUY_OFFSET_DST_COLS, "");
static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
}
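// These static_asserts pin the KernelParamsFloat field layout to the
// hard-coded RUY_OFFSET_* constants, so that the "ldr rN, [%[params], #...]"
// accesses in the inline asm below cannot silently go stale: any change to
// the struct layout becomes a compile-time error.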
// Float kernel for ARM32 out-of-order cores.
// Just like the ARM64 float kernel, except that it accumulates into an 8x4
// block so that only 16 128-bit NEON registers are needed. This is a "first
// pass" kernel, not yet tuned. It is meant to run on out-of-order CPUs like
// the Krait 400 or the Cortex-A9.
void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
CheckOffsetsInKernelParamsFloat32(params);
gemmlowp::ScopedProfilingLabel label(
"Kernel (kNeon, optimized for out-of-order cores)");
const float* lhs_ptr = params.lhs_base_ptr;
const float* rhs_ptr = params.rhs_base_ptr;
// In ARM32 NEON, there are 16 128-bit "q" registers. These registers are
// each composed of two 64-bit "d" registers. The asm kernel below has the
// following NEON register allocation:
// Registers q3 -- q10 are accumulators. During accumulation,
// q0 -- q2 (d0 -- d5) are used to load data from LHS and RHS. q0 and q1
// are used to load an 8x1 block of LHS, and q2 is used to load a 1x4 block
// of RHS, like this:
// Register layout in "q" registers:
//                                 RHS 1x4 block
//                        /--------------------------\
//                        | q2.s[0]  ...  q2.s[3]    |
//                        \--------------------------/
//        LHS 8x1 block
//  /---------------------\  /--------------------------\
//  |       q0.s[0]       |  | q3.s[0]   ...   q9.s[0]  |
//  |         ...         |  |   ...            ...     |
//  |       q0.s[3]       |  | q3.s[3]         q9.s[3]  |
//  |       q1.s[0]       |  | q4.s[0]        q10.s[0]  |
//  |         ...         |  |   ...            ...     |
//  |       q1.s[3]       |  | q4.s[3]   ...  q10.s[3]  |
//  \---------------------/  \--------------------------/
//                               accumulators 8x4 block
// q11, q14 and q15 are currently unused. q12 and q13 are used to load
// parameters for the post-accumulation part of the kernel (bias values,
// then the clamp bounds).
// For completeness, here is the same register layout in "d" registers:
//                                 RHS 1x4 block
//                        /--------------------------\
//                        | d4[0]    ...    d5[1]    |
//                        \--------------------------/
//        LHS 8x1 block
//  /---------------------\  /--------------------------\
//  |        d0[0]        |  |  d6[0]   ...   d18[0]    |
//  |         ...         |  |   ...           ...      |
//  |        d1[1]        |  |  d7[1]         d19[1]    |
//  |        d2[0]        |  |  d8[0]         d20[0]    |
//  |         ...         |  |   ...           ...      |
//  |        d3[1]        |  |  d9[1]   ...   d21[1]    |
//  \---------------------/  \--------------------------/
//                               accumulators 8x4 block
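// For reference, a plain scalar C sketch of what one depth step of the
// accumulation below computes (illustrative only, not part of ruy; `acc`
// stands for the q3 -- q10 accumulators, `lhs` for q0 -- q1, `rhs` for q2):
//
//   for (int col = 0; col < 4; ++col) {    // rhs: d4[0], d4[1], d5[0], d5[1]
//     for (int row = 0; row < 8; ++row) {  // lhs: q0.s[0..3], q1.s[0..3]
//       acc[row][col] += lhs[row] * rhs[col];
//     }
//   }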
asm volatile(
#define RUY_MAKE_ZERO(reg) "mov r0, #0\n vdup.32 " #reg ", r0\n"
// clang-format off
// Load the first 32 bytes of LHS and 16 bytes of RHS data.
// Load q0
"vld1.32 {d0}, [%[lhs_ptr]]!\n"
"vld1.32 {d1}, [%[lhs_ptr]]!\n"
// Load q1
"vld1.32 {d2}, [%[lhs_ptr]]!\n"
"vld1.32 {d3}, [%[lhs_ptr]]!\n"
// Load q2
"vld1.32 {d4}, [%[rhs_ptr]]!\n"
"vld1.32 {d5}, [%[rhs_ptr]]!\n"
"sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
"ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
"str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
"ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
"str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
"ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
"ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
"str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
"ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
"ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
"str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
// Clear accumulators.
RUY_MAKE_ZERO(q3)
RUY_MAKE_ZERO(q4)
RUY_MAKE_ZERO(q5)
RUY_MAKE_ZERO(q6)
RUY_MAKE_ZERO(q7)
RUY_MAKE_ZERO(q8)
RUY_MAKE_ZERO(q9)
RUY_MAKE_ZERO(q10)
// r1 is the number of levels of depth that we have already loaded
// LHS and RHS data for. Corresponding to the initial vld1 instructions
// above, this is currently 1.
"mov r1, #1\n"
// Main loop of the whole GEMM, over rows and columns of the
// destination matrix.
"1:\n"
// Accumulation loop
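// In C terms, this accumulation loop is (sketch only):
//   while (r1 < depth) {
//     multiply-accumulate the already-loaded q0, q1, q2 into q3 -- q10;
//     load q0, q1, q2 with the data for the next level of depth;
//     ++r1;
//   }
// followed by one final multiply-accumulate at label 79 below.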
"ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
"cmp r1, r2\n"
"beq 79f\n"
"2:\n"
"vmla.f32 q3, q0, d4[0]\n"
"vmla.f32 q5, q0, d4[1]\n"
"vmla.f32 q7, q0, d5[0]\n"
"vmla.f32 q9, q0, d5[1]\n"
"vld1.32 {d0}, [%[lhs_ptr]]!\n" // Reload LHS 1 into r0
"vld1.32 {d1}, [%[lhs_ptr]]!\n" // Reload LHS 1 into r0
"vmla.f32 q4, q1, d4[0]\n"
"vmla.f32 q6, q1, d4[1]\n"
"vmla.f32 q8, q1, d5[0]\n"
"vmla.f32 q10, q1, d5[1]\n"
"vld1.32 {d2}, [%[lhs_ptr]]!\n" // Reload LHS 2 into r1
"vld1.32 {d3}, [%[lhs_ptr]]!\n" // Reload LHS 2 into r1
"vld1.32 {d4}, [%[rhs_ptr]]!\n" // Reload RHS into r2
"vld1.32 {d5}, [%[rhs_ptr]]!\n" // Reload RHS into r2
"add r1, r1, #1\n"
"cmp r1, r2\n"
"blt 2b\n"
"79:\n"
// End of the inner loop on depth. Now perform the remaining
// multiply-adds of the last level of depth, for which the LHS
// and RHS data is already loaded.
"vmla.f32 q3, q0, d4[0]\n"
"vmla.f32 q5, q0, d4[1]\n"
"vmla.f32 q7, q0, d5[0]\n"
"vmla.f32 q9, q0, d5[1]\n"
"vmla.f32 q4, q1, d4[0]\n"
"vmla.f32 q6, q1, d4[1]\n"
"vmla.f32 q8, q1, d5[0]\n"
"vmla.f32 q10, q1, d5[1]\n"
// End of accumulation. The registers q3 -- q10 contain the final
// float32 accumulator values of the current 8x4 destination block.
// We now have to compute the final values from these accumulators
// and advance to the next 8x4 block. We intertwine
// these two aspects whenever possible for optimal pipelining, both
// at the data flow level (prefetch data for next block as early as
// possible) and instruction pipelining level (some of the next-block
// work can dual-issue with some of the final work on the current
// block).
// Logic to advance to the next block in preparation for the next
// iteration of the main loop. For now, we only want to compute
// the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
// not yet ready to update the values of row and col, as we still need
// the current values for the rest of the work on the current block.
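// Equivalent C sketch of this advance logic (illustrative only; strides
// are in bytes):
//   if (row < last_row) {
//     lhs_col_ptr += 8 * lhs_stride;   // advance to the next 8 rows
//   } else {
//     lhs_col_ptr = lhs_base_ptr;      // back to the first row...
//     if (col < last_col) {
//       rhs_col_ptr += 4 * rhs_stride; // ...of the next 4 columns
//     }
//   }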
"ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
"ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
"cmp r1, r3\n" // Have we finished the last row?
"bge 4f\n" // If finished last row, go to 4
// Not finished last row: then advance to next row.
"ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
"ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
"add r4, r4, r1, lsl #3\n"
"str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
"b 5f\n"
"4:\n" // Finished last row...
"ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
// Go back to first row
"str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
// Now we need to advance to the next column. If we already
// finished the last column, then in principle we are done, however
// we can't just return here, as we need to allow the end work of the
// current block to complete. The good news is that at this point it
// doesn't matter what data we load for the next column, since
// we will exit from the main loop below before actually storing
// anything computed from that data.
"ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
"ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
"cmp r8, r4\n" // Have we finished the last column?
"bge 5f\n" // If yes, just carry on without updating the column pointer.
// Not finished last column: then advance to next column.
"ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
"ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
"add r10, r10, r1, lsl #2\n"
"str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
"5:\n"
// Set the LHS and RHS data pointers to the start of the columns just
// computed.
"ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
"mov %[lhs_ptr], r4\n"
"ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
"mov %[rhs_ptr], r5\n"
// Load some parameters needed for the end work on current block.
"ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
"ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
// Offset these base pointers as needed given the current row, col.
"ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
"add r5, r1, r8, lsl #2\n"
"tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
"it ne\n"
"movne r1, r5\n"
// Load 8 bias values.
"vld1.32 {d24}, [r1]!\n"
"vld1.32 {d25}, [r1]!\n"
"vld1.32 {d26}, [r1]!\n"
"vld1.32 {d27}, [r1]\n"
// Now that we know what LHS and RHS data the next iteration of the
// main loop will need to load, we start loading the first 32 bytes of
// LHS and 16 bytes of RHS into q0 -- q2, as we don't need q0 -- q2
// anymore in the rest of the work on the current block.
// Load q0
"vld1.32 {d0}, [%[lhs_ptr]]!\n"
"vld1.32 {d1}, [%[lhs_ptr]]!\n"
// Load q1
"vld1.32 {d2}, [%[lhs_ptr]]!\n"
"vld1.32 {d3}, [%[lhs_ptr]]!\n"
// Load q2
"vld1.32 {d4}, [%[rhs_ptr]]!\n"
"vld1.32 {d5}, [%[rhs_ptr]]!\n"
// Perform the bias-addition.
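// q12 = d24:d25 holds bias[0..3] and q13 = d26:d27 holds bias[4..7], so
// each of the 4 accumulator columns receives the same 8 bias values.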
"vadd.f32 q3, q3, q12\n"
"vadd.f32 q4, q4, q13\n"
"vadd.f32 q5, q5, q12\n"
"vadd.f32 q6, q6, q13\n"
"vadd.f32 q7, q7, q12\n"
"vadd.f32 q8, q8, q13\n"
"vadd.f32 q9, q9, q12\n"
"vadd.f32 q10, q10, q13\n"
// Load the clamp_min, clamp_max bounds
"ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
"ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
"vdup.32 q12, r2\n" // clamp_min
"vdup.32 q13, r3\n" // clamp_max
// Apply the clamp_min bound
"vmax.f32 q3, q3, q12\n"
"vmax.f32 q4, q4, q12\n"
"vmax.f32 q5, q5, q12\n"
"vmax.f32 q6, q6, q12\n"
"vmax.f32 q7, q7, q12\n"
"vmax.f32 q8, q8, q12\n"
"vmax.f32 q9, q9, q12\n"
"vmax.f32 q10, q10, q12\n"
// Apply the clamp_max bound
"vmin.f32 q3, q3, q13\n"
"vmin.f32 q4, q4, q13\n"
"vmin.f32 q5, q5, q13\n"
"vmin.f32 q6, q6, q13\n"
"vmin.f32 q7, q7, q13\n"
"vmin.f32 q8, q8, q13\n"
"vmin.f32 q9, q9, q13\n"
"vmin.f32 q10, q10, q13\n"
// Compute how much of the 8x4 block of destination values that
// we have computed fits in the destination matrix. Typically, all of
// it fits, but when the destination matrix shape is not a multiple
// of 8x4, there are some 8x4 blocks along the boundaries that do
// not fit entirely.
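// In C terms (sketch): r1 = min(dst_rows - row, 8) is the number of rows
// of the 8x4 block that fit, and r2 = min(dst_cols - col, 4) the number
// of columns.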
"ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
"ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
"sub r1, r1, r8\n"
"ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
"ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
"sub r2, r2, r4\n"
"mov r3, #8\n"
"mov r5, #4\n"
"cmp r1, #8\n"
// Compute r1 = how many rows of the 8x4 block fit
"it gt\n"
"movgt r1, r3\n"
"cmp r2, #4\n"
// Compute r2 = how many cols of the 8x4 block fit
"it gt\n"
"movgt r2, r5\n"
// Test if r1 == 8 && r2 == 4, i.e. if all of the 8x4 block fits.
"cmp r1, r3\n"
"it eq\n"
"cmpeq r2, r5\n"
// Yes, all of the 8x4 block fits, go to fast path.
"beq 30f\n"
// Not all of the 8x4 block fits.
// Set (r3 address, r4 stride) to write to dst_tmp_buf
"mov r3, %[dst_tmp_buf]\n"
"mov r4, #32\n"
"b 31f\n"
"30:\n"
// Yes, all of the 8x4 block fits.
// Set (r3 address, r4 stride) to write directly to destination matrix.
"ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
"ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
"mov r4, r5\n"
"31:\n"
// Write our float values to the destination described by
// (r3 address, r4 stride).
// q3 = d6, d7
"vstr d6, [r3, #0]\n"
"vstr d7, [r3, #8]\n"
// q4 = d8, d9
"vstr d8, [r3, #16]\n"
"vstr d9, [r3, #24]\n"
"add r3, r3, r4\n"
RUY_MAKE_ZERO(q3)
RUY_MAKE_ZERO(q4)
// q5 = d10, d11
"vstr d10, [r3, #0]\n"
"vstr d11, [r3, #8]\n"
// q6 = d12, d13
"vstr d12, [r3, #16]\n"
"vstr d13, [r3, #24]\n"
"add r3, r3, r4\n"
RUY_MAKE_ZERO(q5)
RUY_MAKE_ZERO(q6)
// q7 = d14, d15
"vstr d14, [r3, #0]\n"
"vstr d15, [r3, #8]\n"
// q8 = d16, d17
"vstr d16, [r3, #16]\n"
"vstr d17, [r3, #24]\n"
"add r3, r3, r4\n"
RUY_MAKE_ZERO(q7)
RUY_MAKE_ZERO(q8)
// q9 = d18, d19
"vstr d18, [r3, #0]\n"
"vstr d19, [r3, #8]\n"
// q10 = d20, d21
"vstr d20, [r3, #16]\n"
"vstr d21, [r3, #24]\n"
"add r3, r3, r4\n"
RUY_MAKE_ZERO(q9)
RUY_MAKE_ZERO(q10)
// If all of the 8x4 block fits, we just finished writing it to the
// destination, so we skip the next part.
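// (The "eq" condition tested here is left over from the cmp/cmpeq pair
// before the branch to 30f above: none of the vstr, add and mov/vdup
// instructions in between modify the condition flags.)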
"beq 41f\n"
// Not all of the 8x4 block fits in the destination matrix. We just
// wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
// it to copy into the destination matrix the part that fits.
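// Equivalent C sketch of this copy loop (illustrative only; dst is a
// byte pointer, dst_tmp_buf holds the 8x4 block with 8 floats per column):
//   for (int c = 0; c < r2; ++c) {      // r2 = cols that fit
//     for (int r = 0; r < r1; ++r) {    // r1 = rows that fit
//       ((float*)(dst + c * dst_stride))[r] = dst_tmp_buf[c * 8 + r];
//     }
//   }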
"ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
"mov r3, %[dst_tmp_buf]\n"
"ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
"mov r6, #0\n"
"50:\n"
"mov r5, #0\n"
"51:\n"
"ldr r10, [r3, r5, lsl #2]\n"
"str r10, [r4, r5, lsl #2]\n"
"add r5, r5, #1\n"
"cmp r5, r1\n"
"blt 51b\n"
"add r6, r6, #1\n"
"add r3, r3, #32\n"
"add r4, r4, r8\n"
// r2 = how many cols of the 8x4 block fit
"cmp r6, r2\n"
"blt 50b\n"
"41:\n"
// Load dst_ptr, increment, and write back.
"ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
"add r4, r4, #32\n"
"str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
// At this point we have completely finished writing values to the
// destination matrix for the current block.
// Reload some params --- we had used r3, r5, r10 for a few other things
// since the last time we had loaded them.
"ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
"ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
"ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
// Move to the next block of the destination matrix, for the next iter
// of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already
// been updated earlier.
// Have we reached the end row?
"ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
"cmp r8, r3\n"
"beq 20f\n" // yes, end row.
// Not end row. Move to the next row.
"add r8, r8, #8\n"
// Store new value of row
"str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
"b 21f\n"
"20:\n"
// Was already at end row.
// Move back to first row.
"str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
// Move to the next column.
"ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
"add r4, r4, #4\n"
"str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
"ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
"ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
// Increment dst_col_ptr by 4 * dst_stride (i.e. 4 columns)
"add r1, r1, r8, lsl #2\n"
// Store dst_col_ptr
"str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
// Store dst_ptr
"str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
"21:\n"
// Main loop exit condition: have we hit the end column?
"ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
"ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
"cmp r8, r4\n"
// r1 is the number of levels of depth that we have already loaded
// LHS and RHS data for. Corresponding to the initial vld1 instructions
// above, this is currently 1.
"mov r1, #1\n"
"ble 1b\n"
// Restore stack pointer.
"add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
// clang-format on
: [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
: [params] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
"memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
"d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18",
"d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27");
}
#undef RUY_OFFSET_BIAS
#undef RUY_OFFSET_FLAGS
#undef RUY_OFFSET_LHS_BASE_PTR
#undef RUY_OFFSET_CLAMP_MIN
#undef RUY_OFFSET_CLAMP_MAX
#undef RUY_OFFSET_START_ROW
#undef RUY_OFFSET_LAST_ROW
#undef RUY_OFFSET_LAST_COL
#undef RUY_OFFSET_LHS_STRIDE
#undef RUY_OFFSET_RHS_STRIDE
#undef RUY_OFFSET_DST_STRIDE
#undef RUY_OFFSET_DEPTH
#undef RUY_OFFSET_START_COL
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR
#undef RUY_OFFSET_DST_ROWS
#undef RUY_OFFSET_DST_COLS
#endif // (defined RUY_NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
} // namespace ruy