Ruy - ARM32 asm packing kernel
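
The 32-bit path packs the four per-pointer increments (16 or 0 bytes each)
into the low 4 bits of a single src_inc operand, since 32-bit ARM has fewer
general-purpose registers than 64-bit ARM. A minimal C++ sketch of that
encoding, with hypothetical helper names that are not part of this change:

    // Illustrative only: PackImpl does this inline, and the asm kernel
    // performs the decode with an AND and a shift.
    int EncodeSrcInc(int src_inc0, int src_inc1, int src_inc2, int src_inc3) {
      int src_inc = 0;
      src_inc += src_inc0 == 16 ? 1 : 0;
      src_inc += src_inc1 == 16 ? 2 : 0;
      src_inc += src_inc2 == 16 ? 4 : 0;
      src_inc += src_inc3 == 16 ? 8 : 0;
      return src_inc;
    }

    // Byte increment recovered for pointer N (0..3), mirroring the asm's
    // AND with (1 << N) followed by a left shift by (4 - N).
    int DecodeSrcInc(int src_inc, int n) {
      return (src_inc & (1 << n)) << (4 - n);  // 16 if bit n is set, else 0
    }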

PiperOrigin-RevId: 258974397
diff --git a/pack.cc b/pack.cc
index b7d6e63..901ddc3 100644
--- a/pack.cc
+++ b/pack.cc
@@ -1045,6 +1045,9 @@
         "v27", "v28", "v29", "v30", "v31");
 }
 
+#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
                              const float* src_ptr2, const float* src_ptr3,
                              int src_inc0, int src_inc1, int src_inc2,
@@ -1178,7 +1181,201 @@
         "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
         "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
 }
+#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
+#if RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
+                             const float* src_ptr2, const float* src_ptr3,
+                             int src_inc, int src_rows, int src_zero_point,
+                             float* packed_ptr, int start_col, int end_col,
+                             int output_stride) {
+  gemmlowp::ScopedProfilingLabel label(
+      "Pack (kNeon, optimized for out-of-order cores)");
+  asm volatile(
+      // clang-format off
+          "mov r1, #0\n"
+          "and r2, %[rows], #-4\n"
+          "cmp r1, r2\n"
+          "beq 3f\n"
+#define RUY_LOAD_FOUR_BY_FOUR()               \
+  /* Load q0 */                               \
+  "vldr d0, [%[src_ptr0], #0]\n"              \
+  "vldr d1, [%[src_ptr0], #8]\n"              \
+  /* if src_inc0 != 0, add 16 to src_ptr0 */  \
+  "and r3, %[src_inc], #1\n"                  \
+  "add %[src_ptr0], %[src_ptr0], r3, lsl #4\n"\
+  /* Load q1 */                               \
+  "vldr d2, [%[src_ptr1], #0]\n"              \
+  "vldr d3, [%[src_ptr1], #8]\n"              \
+  /* if src_inc1 != 0, add 16 to src_ptr1 */  \
+  "and r3, %[src_inc], #2\n"                  \
+  "add %[src_ptr1], %[src_ptr1], r3, lsl #3\n"\
+  /* Load q2 */                               \
+  "vldr d4, [%[src_ptr2], #0]\n"              \
+  "vldr d5, [%[src_ptr2], #8]\n"              \
+  /* if src_inc2 != 0, add 16 to src_ptr2 */  \
+  "and r3, %[src_inc], #4\n"                  \
+  "add %[src_ptr2], %[src_ptr2], r3, lsl #2\n"\
+  /* Load q3 */                               \
+  "vldr d6, [%[src_ptr3], #0]\n"              \
+  "vldr d7, [%[src_ptr3], #8]\n"              \
+  /* if src_inc3 != 0, add 16 to src_ptr3 */  \
+  "and r3, %[src_inc], #8\n"                  \
+  "add %[src_ptr3], %[src_ptr3], r3, lsl #1\n"\
+
+          RUY_LOAD_FOUR_BY_FOUR()
+          "add r1, r1, #4\n"
+          "cmp r1, r2\n"
+
+          "beq 2f\n"
+
+          "1:\n"
+          "add r1, r1, #4\n"
+
+          // Transpose 4x4 matrix.
+          "vzip.32 q0, q1\n"
+          "vzip.32 q2, q3\n"
+
+          "vtrn.32 q0, q2\n"
+          "vtrn.32 q1, q3\n"
+
+          "vzip.32 q0, q2\n"
+          "vzip.32 q1, q3\n"
+
+          "vmov q8, q0\n"
+          "vmov q9, q1\n"
+          "vmov q10, q2\n"
+          "vmov q11, q3\n"
+
+          RUY_LOAD_FOUR_BY_FOUR()
+#undef RUY_LOAD_FOUR_BY_FOUR
+
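+// The zip/transpose sequence above leaves the four rows of the 4x4 block in
+// q0, q2, q1, q3 (saved to q8, q10, q9, q11), which is why rows are stored
+// in the order q8, q10, q9, q11.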
+#define RUY_STORE_FOUR_BY_FOUR()                  \
+  /* Store q8, q10, q9, q11 */                    \
+  /* q8 = d16, d17 */                             \
+  "vstr d16, [%[packed_ptr], #0]\n"               \
+  "vstr d17, [%[packed_ptr], #8]\n"               \
+  /* q10 = d20, d21 */                            \
+  "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
+  "vstr d20, [%[packed_ptr], #0]\n"               \
+  "vstr d21, [%[packed_ptr], #8]\n"               \
+  /* q9 = d18, d19 */                             \
+  "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
+  "vstr d18, [%[packed_ptr], #0]\n"               \
+  "vstr d19, [%[packed_ptr], #8]\n"               \
+  /* q11 = d22, d23 */                            \
+  "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
+  "vstr d22, [%[packed_ptr], #0]\n"               \
+  "vstr d23, [%[packed_ptr], #8]\n"               \
+  "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
+
+          RUY_STORE_FOUR_BY_FOUR()
+          "cmp r1, r2\n"
+
+          "bne 1b\n"
+
+          "2:\n"
+
+          // Transpose 4x4 matrix.
+          "vzip.32 q0, q1\n"
+          "vzip.32 q2, q3\n"
+
+          "vtrn.32 q0, q2\n"
+          "vtrn.32 q1, q3\n"
+
+          "vzip.32 q0, q2\n"
+          "vzip.32 q1, q3\n"
+
+          "vmov q8, q0\n"
+          "vmov q9, q1\n"
+          "vmov q10, q2\n"
+          "vmov q11, q3\n"
+
+          RUY_STORE_FOUR_BY_FOUR()
+#undef RUY_STORE_FOUR_BY_FOUR
+          "3:\n"
+
+          "ands r2, %[rows], #3\n"
+          "beq 4f\n"
+          "mov r0, 0\n"
+          // Zero out q0 - q3
+          "vdup.32 q0, r0\n"
+          "vdup.32 q1, r0\n"
+          "vdup.32 q2, r0\n"
+          "vdup.32 q3, r0\n"
+#define RUY_LOAD_ONE_ROW_FIRST_HALF(R, I)    \
+  "cmp r2, #" #R "\n"                        \
+  "beq 5f\n"                                 \
+  "vld1.32 { d0[" #I "] }, [%[src_ptr0]]!\n" \
+  "vld1.32 { d2[" #I "] }, [%[src_ptr1]]!\n" \
+  "vld1.32 { d4[" #I "] }, [%[src_ptr2]]!\n" \
+  "vld1.32 { d6[" #I "] }, [%[src_ptr3]]!\n"
+
+#define RUY_LOAD_ONE_ROW_SECOND_HALF(R, I)   \
+  "cmp r2, #" #R "\n"                        \
+  "beq 5f\n"                                 \
+  "vld1.32 { d1[" #I "] }, [%[src_ptr0]]!\n" \
+  "vld1.32 { d3[" #I "] }, [%[src_ptr1]]!\n" \
+  "vld1.32 { d5[" #I "] }, [%[src_ptr2]]!\n" \
+  "vld1.32 { d7[" #I "] }, [%[src_ptr3]]!\n"
+
+          RUY_LOAD_ONE_ROW_FIRST_HALF(0, 0)
+          RUY_LOAD_ONE_ROW_FIRST_HALF(1, 1)
+          RUY_LOAD_ONE_ROW_SECOND_HALF(2, 0)
+          RUY_LOAD_ONE_ROW_SECOND_HALF(3, 1)
+#undef RUY_LOAD_ONE_ROW_SECOND_HALF
+#undef RUY_LOAD_ONE_ROW_FIRST_HALF
+          "5:\n"
+
+          // Transpose 4x4 matrix.
+          "vzip.32 q0, q1\n"
+          "vzip.32 q2, q3\n"
+
+          "vtrn.32 q0, q2\n"
+          "vtrn.32 q1, q3\n"
+
+          "vzip.32 q0, q2\n"
+          "vzip.32 q1, q3\n"
+
+          "vmov q8, q0\n"
+          "vmov q9, q1\n"
+          "vmov q10, q2\n"
+          "vmov q11, q3\n"
+
+          "mov r1, #32\n"
+
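+// Store only as many rows as remain; as in the full-block store, the
+// transposed rows come out in the order q8, q10, q9, q11.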
+#define RUY_STORE_ONE_ROW(ROW, REGISTER1, REGISTER2)      \
+          "cmp r2, #" #ROW "\n"                           \
+          "beq 4f\n"                                      \
+          "vstr " #REGISTER1 ", [%[packed_ptr]]\n"    \
+          "vstr " #REGISTER2 ", [%[packed_ptr], #8]\n"    \
+          "add %[packed_ptr], %[packed_ptr], %[stride]\n"
+
+          // Store q8
+          RUY_STORE_ONE_ROW(0, d16, d17)
+          // Store q10
+          RUY_STORE_ONE_ROW(1, d20, d21)
+          // Store q9
+          RUY_STORE_ONE_ROW(2, d18, d19)
+          // Store q11
+          RUY_STORE_ONE_ROW(3, d22, d23)
+
+#undef RUY_STORE_ONE_ROW
+
+          "4:\n"
+
+      // clang-format on
+      : [ src_ptr0 ] "+r"(src_ptr0), [ src_ptr1 ] "+r"(src_ptr1),
+        [ src_ptr2 ] "+r"(src_ptr2), [ src_ptr3 ] "+r"(src_ptr3),
+        [ packed_ptr ] "+r"(packed_ptr)
+      : [ src_inc ] "r"(static_cast<std::int64_t>(src_inc)),
+        [ rows ] "r"(src_rows), [ stride ] "r"(output_stride)
+      : "cc", "memory", "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3",
+        "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13",
+        "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23");
+}
+
+#endif  // RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1,
                           const float* src_ptr2, const float* src_ptr3,
                           int src_inc0, int src_inc1, int src_inc2,
diff --git a/pack.h b/pack.h
index 4dc5833..241a665 100644
--- a/pack.h
+++ b/pack.h
@@ -157,10 +157,11 @@
 };
 
 RUY_INHERIT_PACK(Path::kStandardCpp, Path::kNeon)
+#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 RUY_INHERIT_PACK(Path::kNeon, Path::kNeonDotprod)
+#endif
 
 #if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
-
 void Pack8bitNeonOutOfOrder(const void* src_ptr0, const void* src_ptr1,
                             const void* src_ptr2, const void* src_ptr3,
                             int src_inc0, int src_inc1, int src_inc2,
@@ -317,7 +318,9 @@
     }
   }
 };
+#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
+#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
                              const float* src_ptr2, const float* src_ptr3,
                              int src_inc0, int src_inc1, int src_inc2,
@@ -329,6 +332,17 @@
                           int src_inc3, int src_rows, int src_zero_point,
                           float* packed_ptr, int start_col, int end_col);
 
+#elif RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
+                             const float* src_ptr2, const float* src_ptr3,
+                             int src_inc, int src_rows, int src_zero_point,
+                             float* packed_ptr, int start_col, int end_col,
+                             int stride);
+#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+#if (RUY_PLATFORM(NEON_32) || RUY_PLATFORM(NEON_64)) && \
+    RUY_OPT_ENABLED(RUY_OPT_ASM)
+
 template <>
 struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
                 float, float> {
@@ -370,6 +384,7 @@
       float* packed_ptr = packed_matrix->data +
                           packed_matrix->layout.stride * (block_col & ~7) +
                           ((block_col & 4));
+#if RUY_PLATFORM(NEON_64)
       if (__builtin_expect(tuning == Tuning::kInOrder, true)) {
         PackFloatNeonInOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc0,
                              src_inc1, src_inc2, src_inc3,
@@ -381,11 +396,88 @@
                                 src_matrix.layout.rows, src_matrix.zero_point,
                                 packed_ptr, start_col, end_col);
       }
+#else
+      // Encode each of src_inc0, ..., src_inc3 in lowest 4 bits of src_inc
+      // to save on registers (we have fewer general purpose registers in
+      // 32-bit ARM than in 64-bit ARM). For the 64-bit case, we pass four
+      // values that are each either 16 or 0 and use them directly. For the
+      // 32-bit case, bits 0, 1, 2, and 3 are used to determine if we should
+      // use the value 16 (bit is set) or 0 (bit is not set) for the
+      // respective increment value.
+      std::int64_t src_inc = 0;
+      src_inc += src_inc0 == 16 ? 1 : 0;
+      src_inc += src_inc1 == 16 ? 2 : 0;
+      src_inc += src_inc2 == 16 ? 4 : 0;
+      src_inc += src_inc3 == 16 ? 8 : 0;
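+      // For example, src_inc0 = 16, src_inc1 = 0, src_inc2 = 16, src_inc3 = 16
+      // encodes as src_inc = 1 + 4 + 8 = 13 (binary 1101); the asm recovers
+      // each 16-or-0 increment with an AND and a left shift.
+      // kOutputStride is the byte distance between packed rows: 8 floats
+      // (32 bytes) for this kRowMajor 1x8 layout.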
+      const int kOutputStride = 32;
+      PackFloatNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc,
+                              src_matrix.layout.rows, src_matrix.zero_point,
+                              packed_ptr, start_col, end_col, kOutputStride);
+#endif  // RUY_PLATFORM(NEON_64)
     }
   }
 };
 
-#endif  // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if RUY_PLATFORM(NEON_32)
+// The 32-bit float kernel is 8 rows by 4 columns, so we need an additional
+// specialization for a FixedKernelLayout with 4 columns.
+template <>
+struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 4>, float,
+                float, float> {
+  static void Run(Tuning tuning, const Matrix<float>& src_matrix,
+                  PackedMatrix<float>* packed_matrix, int start_col,
+                  int end_col) {
+    RUY_DCHECK(IsColMajor(src_matrix.layout));
+    RUY_DCHECK(IsColMajor(packed_matrix->layout));
+    RUY_DCHECK_EQ(start_col % 4, 0);
+    const float zerobuf[4] = {0};
+    for (int block_col = start_col; block_col < end_col; block_col += 4) {
+      int src_stride = src_matrix.layout.stride;
+      const float* src_ptr0 = src_matrix.data.get() + src_stride * block_col;
+      const float* src_ptr1 = src_ptr0 + src_stride;
+      const float* src_ptr2 = src_ptr1 + src_stride;
+      const float* src_ptr3 = src_ptr2 + src_stride;
+      std::int64_t src_inc0 = 16;
+      std::int64_t src_inc1 = 16;
+      std::int64_t src_inc2 = 16;
+      std::int64_t src_inc3 = 16;
+      if (block_col >= src_matrix.layout.cols - 3) {
+        if (block_col >= src_matrix.layout.cols - 0) {
+          src_ptr0 = zerobuf;
+          src_inc0 = 0;
+        }
+        if (block_col >= src_matrix.layout.cols - 1) {
+          src_ptr1 = zerobuf;
+          src_inc1 = 0;
+        }
+        if (block_col >= src_matrix.layout.cols - 2) {
+          src_ptr2 = zerobuf;
+          src_inc2 = 0;
+        }
+        if (block_col >= src_matrix.layout.cols - 3) {
+          src_ptr3 = zerobuf;
+          src_inc3 = 0;
+        }
+      }
+      float* packed_ptr =
+          packed_matrix->data + packed_matrix->layout.stride * (block_col);
+      // Encode each of src_inc0, ..., src_inc3 in the lowest 4 bits of
+      // src_inc to save registers.
+      std::int64_t src_inc = 0;
+      src_inc += src_inc0 == 16 ? 1 : 0;
+      src_inc += src_inc1 == 16 ? 2 : 0;
+      src_inc += src_inc2 == 16 ? 4 : 0;
+      src_inc += src_inc3 == 16 ? 8 : 0;
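+      // kOutputStride is the byte distance between packed rows: 4 floats
+      // (16 bytes) for this kRowMajor 1x4 layout.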
+      const int kOutputStride = 16;
+      PackFloatNeonOutOfOrder(src_ptr0, src_ptr1, src_ptr2, src_ptr3, src_inc,
+                              src_matrix.layout.rows, src_matrix.zero_point,
+                              packed_ptr, start_col, end_col, kOutputStride);
+    }
+  }
+};
+#endif  // RUY_PLATFORM(NEON_32)
+#endif  // (RUY_PLATFORM(NEON_64) || RUY_PLATFORM(NEON_32)) &&
+        // RUY_OPT_ENABLED(RUY_OPT_ASM)
 
 // Main entry point for packing.
 template <Path ThePath, typename FixedKernelLayout, typename Scalar,