Add align directives for branch targets

- Alignment helps some older CPUs and doesn't hurt newer ones (see the sketch below)
- Renumber local labels on branches so they stay consecutive
- Sort the build file source list and the test order

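A minimal sketch of the pattern (illustrative only, not copied verbatim
from the kernels): the GAS directive ".p2align 3" pads the instruction
stream with NOPs so that the next instruction starts on an 8-byte (2^3)
boundary, and the numeric label that follows is a local label, reached
via "1b" (the nearest preceding "1:"):

        .p2align 3                      // pad to an 8-byte boundary
1:                                      // aligned loop entry (local label)
        VLD1.8  {q4}, [r8]!             // loop body ...
        SUBS    r11, r11, 8             // 8 channels per iteration
        BHI     1b                      // branch back to the aligned label

Local numeric labels are matched by proximity, so keeping them
consecutive aids readability; this is why labels 7/8/9 in the
16-channel kernel are renumbered to 6/7/8 along with their Nf branch
references.
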
PiperOrigin-RevId: 452104994
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7d5ee8a..f68b9c2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6396,8 +6396,6 @@
   src/x8-lut/gen/lut-avx512skx-vpshufb-x256.c)
 
 SET(AARCH32_ASM_MICROKERNEL_SRCS
-  src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
-  src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
   src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
   src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S
   src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
@@ -6414,6 +6412,8 @@
   src/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S
   src/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S
   src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
+  src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
+  src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
   src/qc8-gemm/gen/1x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S
   src/qc8-gemm/gen/1x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S
   src/qc8-gemm/gen/1x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a35.S
diff --git a/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S b/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
index 66ca3fe..9a65222 100644
--- a/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
+++ b/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
@@ -60,6 +60,7 @@
         VDUP.8  q11, d20[3]              // output_max
         VDUP.16 q10, d20[0]              // output_zero_point
 
+        .p2align 3
 0:
         LDR     r11, [sp, 100]           // input_offset
         LDMIB   r2, {r5, r6}             // i0, i1
@@ -86,21 +87,22 @@
 // r8 i2
 // q12 q13 q14 q15   accumulators
 
+        .p2align 3
 1:
-        ADD     r7, lr, 64         // skip over bias to get weights
-        VLD1.8  {q4}, [r8]!        // i2
-        VLD1.8  {q12}, [r7]!       // w0
-        VLD1.8  {q5}, [r5]!        // i0
-        VLD1.8  {q13}, [r7]!       // w1
-        VLD1.8  {q6}, [r6]!        // i1
-        VLD1.8  {q14}, [r7]        // w2
+        ADD     r7, lr, 64              // skip over bias to get weights
+        VLD1.8  {q4}, [r8]!             // i2
+        VLD1.8  {q12}, [r7]!            // w0
+        VLD1.8  {q5}, [r5]!             // i0
+        VLD1.8  {q13}, [r7]!            // w1
+        VLD1.8  {q6}, [r6]!             // i1
+        VLD1.8  {q14}, [r7]             // w2
 
-        VMULL.S8 q1, d8,  d24      // i2 * w0
+        VMULL.S8 q1, d8,  d24           // i2 * w0
         VMULL.S8 q2, d9,  d25
-        VMLAL.S8 q1, d10, d26      // i0 * w1
+        VMLAL.S8 q1, d10, d26           // i0 * w1
         VMLAL.S8 q2, d11, d27
-        VMULL.S8 q0, d12, d28      // i1 * w2
-        VLD1.8  {q12, q13}, [lr]!  // load bias
+        VMULL.S8 q0, d12, d28           // i1 * w2
+        VLD1.8  {q12, q13}, [lr]!       // load bias
         VMULL.S8 q3, d13, d29
         VLD1.8  {q14, q15}, [lr], r9
 
@@ -111,7 +113,7 @@
         VADDW.S16 q12, q12, d2
         VADDW.S16 q13, q13, d3
         VADDW.S16 q14, q14, d6
-        VLD1.32 {q0, q1}, [lr]!    // quant per channel scale values
+        VLD1.32 {q0, q1}, [lr]!         // quant per channel scale values
         VADDW.S16 q15, q15, d7
         VLD1.32 {q2, q3}, [lr]!
 
@@ -153,42 +155,42 @@
         BNE     4f
 
 3:
-        LDR     r6, [sp, 92]      // input_stride
-        SUBS    r1, r1, 1         // output_width
-        ADD     r10, r10, r12     // output += output_increment
-        ADD     r2, r2, r6        // input += input_stride
+        LDR     r6, [sp, 92]            // input_stride
+        SUBS    r1, r1, 1               // output_width
+        ADD     r10, r10, r12           // output += output_increment
+        ADD     r2, r2, r6              // input += input_stride
         BNE     0b
 
         VPOP    {d8, d9, d10, d11, d12, d13}
-        ADD     sp, sp, 4         // pad
+        ADD     sp, sp, 4               // pad
         POP     {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 
 // Small Remainder - 1-8 channels
 4:
-        CMP     r11, 9            // handle 9 or more
-        ADD     r7, lr, 64        // skip over bias to get weights
+        CMP     r11, 9                  // handle 9 or more
+        ADD     r7, lr, 64              // skip over bias to get weights
         BHS     5f
 
         MOV     r9, 16
 
-        VLD1.8  {d8}, [r8]         // i2
-        VLD1.8  {d24}, [r7], r9    // w0
-        VLD1.8  {d10}, [r5]        // i0
-        VLD1.8  {d26}, [r7], r9    // w1
-        VLD1.8  {d12}, [r6]        // i1
-        VLD1.8  {d28}, [r7]        // w2
+        VLD1.8  {d8}, [r8]              // i2
+        VLD1.8  {d24}, [r7], r9         // w0
+        VLD1.8  {d10}, [r5]             // i0
+        VLD1.8  {d26}, [r7], r9         // w1
+        VLD1.8  {d12}, [r6]             // i1
+        VLD1.8  {d28}, [r7]             // w2
 
-        VMULL.S8 q1, d8,  d24      // i2 * w0
-        VMLAL.S8 q1, d10, d26      // i0 * w1
-        VMULL.S8 q0, d12, d28      // i1 * w2
-        VLD1.8  {q12, q13}, [lr]   // load bias
+        VMULL.S8 q1, d8,  d24           // i2 * w0
+        VMLAL.S8 q1, d10, d26           // i0 * w1
+        VMULL.S8 q0, d12, d28           // i1 * w2
+        VLD1.8  {q12, q13}, [lr]        // load bias
         ADD     lr, lr, 112
 
         VADDW.S16 q12, q12, d0
         VADDW.S16 q13, q13, d1
         VADDW.S16 q12, q12, d2
         VADDW.S16 q13, q13, d3
-        VLD1.32 {q0, q1}, [lr]    // quant per channel scale values
+        VLD1.32 {q0, q1}, [lr]          // quant per channel scale values
 
         // QC8 FP32 quantization
 
@@ -211,27 +213,28 @@
 
         //  Store 8
         TST     r11, 8
-        BEQ     7f
+        BEQ     6f
         VST1.8  {d24}, [r10]!
         B       3b
 
+        .p2align 3
 // Large Remainder - 9-15 channels
 // Process 16 same as main loop, but conditional store
 
 5:
-        VLD1.8  {q4}, [r8]!        // i2
-        VLD1.8  {q12}, [r7]!       // w0
-        VLD1.8  {q5}, [r5]!        // i0
-        VLD1.8  {q13}, [r7]!       // w1
-        VLD1.8  {q6}, [r6]!        // i1
-        VLD1.8  {q14}, [r7]        // w2
+        VLD1.8  {q4}, [r8]!             // i2
+        VLD1.8  {q12}, [r7]!            // w0
+        VLD1.8  {q5}, [r5]!             // i0
+        VLD1.8  {q13}, [r7]!            // w1
+        VLD1.8  {q6}, [r6]!             // i1
+        VLD1.8  {q14}, [r7]             // w2
 
-        VMULL.S8 q1, d8,  d24      // i2 * w0
+        VMULL.S8 q1, d8,  d24           // i2 * w0
         VMULL.S8 q2, d9,  d25
-        VMLAL.S8 q1, d10, d26      // i0 * w1
+        VMLAL.S8 q1, d10, d26           // i0 * w1
         VMLAL.S8 q2, d11, d27
-        VMULL.S8 q0, d12, d28      // i1 * w2
-        VLD1.8  {q12, q13}, [lr]!  // load bias
+        VMULL.S8 q0, d12, d28           // i1 * w2
+        VLD1.8  {q12, q13}, [lr]!       // load bias
         VMULL.S8 q3, d13, d29
         VLD1.8  {q14, q15}, [lr], r9
 
@@ -242,7 +245,7 @@
         VADDW.S16 q12, q12, d2
         VADDW.S16 q13, q13, d3
         VADDW.S16 q14, q14, d6
-        VLD1.32 {q0, q1}, [lr]!    // quant per channel scale values
+        VLD1.32 {q0, q1}, [lr]!         // quant per channel scale values
         VADDW.S16 q15, q15, d7
         VLD1.32 {q2, q3}, [lr]
 
@@ -280,21 +283,21 @@
         VMOV    d24, d25
 
         // Store 4
-7:
+6:
         TST     r11, 4
-        BEQ     8f
+        BEQ     7f
         VST1.32 {d24[0]}, [r10]!
         VEXT.8  d24, d24, d24, 4
 
         // Store 2
-8:
+7:
         TST     r11, 2
-        BEQ     9f
+        BEQ     8f
         VST1.16 {d24[0]}, [r10]!
         VEXT.8  d24, d24, d24, 2
 
         // Store 1
-9:
+8:
         TST     r11, 1
         BEQ     3b
         VST1.8  {d24[0]}, [r10]!
diff --git a/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S b/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
index 5bd1bd4..0cc5be3 100644
--- a/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
+++ b/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
@@ -57,6 +57,7 @@
         VDUP.8  d19, d20[3]              // output_max
         VDUP.16 q10, d20[0]              // output_zero_point
 
+        .p2align 3
 0:
         LDMIB   r2, {r5, r6}             // i0, i1
         LDR     r8, [r2]                 // i2
@@ -68,7 +69,7 @@
         ADDNE   r8, r8, r7               // i2 += input_offset
 
         MOV     lr, r3
-        MOV     r11, r0  // channel count as is, fall into loop
+        MOV     r11, r0                 // channel count as is, fall into loop
 
 // Main loop - 8 channels
 // lr weights.  r3 reset
@@ -83,19 +84,20 @@
 //   32 quant scale - 8 int
 //   88 bytes total
 
+        .p2align 3
 1:
-        VLD1.8  {q12, q13}, [lr]!  // load bias
+        VLD1.8  {q12, q13}, [lr]!       // load bias
 
-        VLD1.8  {d4}, [r8]!        // i2
-        VLD1.8  {d7}, [lr]!        // w0
-        VLD1.8  {d5}, [r5]!        // i0
-        VLD1.8  {d16}, [lr]!       // w1
-        VLD1.8  {d6}, [r6]!        // i1
-        VLD1.8  {d17}, [lr]!       // w2
+        VLD1.8  {d4}, [r8]!             // i2
+        VLD1.8  {d7}, [lr]!             // w0
+        VLD1.8  {d5}, [r5]!             // i0
+        VLD1.8  {d16}, [lr]!            // w1
+        VLD1.8  {d6}, [r6]!             // i1
+        VLD1.8  {d17}, [lr]!            // w2
 
-        VMULL.S8 q1, d4, d7        // i2 * w0
-        VMLAL.S8 q1, d5, d16       // i0 * w1
-        VMULL.S8 q0, d6, d17       // i1 * w2
+        VMULL.S8 q1, d4, d7             // i2 * w0
+        VMLAL.S8 q1, d5, d16            // i0 * w1
+        VMULL.S8 q0, d6, d17            // i1 * w2
 
 
         VADDW.S16 q12, q12, d0
@@ -103,7 +105,7 @@
         VADDW.S16 q12, q12, d2
         VADDW.S16 q13, q13, d3
 
-        VLD1.32 {q0, q1}, [lr]!    // quant per channel scale values
+        VLD1.32 {q0, q1}, [lr]!         // quant per channel scale values
 
         // QC8 FP32 quantization
 
@@ -118,27 +120,28 @@
 
         VQMOVN.S32 d24, q12
         VQMOVN.S32 d25, q13
-        SUBS    r11, r11, 8     // 8 channels per loop
+        SUBS    r11, r11, 8             // 8 channels per loop
 
         VQADD.S16 q12, q12, q10
         VQMOVN.S16 d24, q12
         VMIN.S8 d24, d24, d19
         VMAX.S8 d24, d24, d18
 
-        BLO     3f              // less than 8?
+        BLO     3f                      // less than 8?
 
         VST1.8  {d24}, [r10]!
-        BHI     1b              // at least 1, continue loop
+        BHI     1b                      // at least 1, continue loop
 
 2:
-        SUBS    r1, r1, 1         // output_width
-        ADD     r10, r10, r12     // output += output_increment
-        ADD     r2, r2, r9        // input += input_stride
+        SUBS    r1, r1, 1               // output_width
+        ADD     r10, r10, r12           // output += output_increment
+        ADD     r2, r2, r9              // input += input_stride
         BNE     0b
 
         ADD     sp, sp, 4
         POP     {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 
+        .p2align 3
         // Store 4
 3:
         TST     r11, 4
@@ -164,4 +167,4 @@
 END_FUNCTION xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35
 #ifdef __ELF__
 .section ".note.GNU-stack","",%progbits
-#endif
\ No newline at end of file
+#endif
diff --git a/test/qc8-dwconv-minmax-fp32.cc b/test/qc8-dwconv-minmax-fp32.cc
index 7adf37a..0ad34a2 100644
--- a/test/qc8-dwconv-minmax-fp32.cc
+++ b/test/qc8-dwconv-minmax-fp32.cc
@@ -20,6 +20,192 @@
 #include "dwconv-microkernel-tester.h"
 
 
+#if XNN_ARCH_ARM
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(3)
+      .channels(8)
+      .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 3; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(3)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, input_offset) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(3)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, zero) {
+    TEST_REQUIRES_ARM_NEON_V8;
+    for (uint32_t mz = 0; mz < 3; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(3)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM
+
+
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__NEON_MLA8_LD64, c_eq_8) {
     TEST_REQUIRES_ARM_NEON;
@@ -2625,192 +2811,6 @@
 
 
 #if XNN_ARCH_ARM
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_eq_8) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    DWConvMicrokernelTester()
-      .cr(8)
-      .kr(3)
-      .channels(8)
-      .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 16; channels < 128; channels += 24) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 16; channels < 128; channels += 24) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 16; channels < 128; channels += 24) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_lt_8) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 1; channels < 8; channels++) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 9; channels < 16; channels++) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 9; channels < 16; channels++) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 9; channels < 16; channels++) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels <= 40; channels += 7) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .width(3)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_step) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels <= 40; channels += 7) {
-      for (size_t step = 2; step <= 3; step++) {
-        DWConvMicrokernelTester()
-          .cr(8)
-          .kr(3)
-          .channels(channels)
-          .width(3)
-          .step(step)
-          .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_output_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels <= 40; channels += 7) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(8)
-        .width(5)
-        .output_stride(43)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels <= 40; channels += 7) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .width(3)
-        .qmin(128)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels <= 40; channels += 7) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .width(3)
-        .qmax(128)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, input_offset) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t channels = 16; channels < 128; channels += 24) {
-      DWConvMicrokernelTester()
-        .cr(8)
-        .kr(3)
-        .channels(channels)
-        .input_offset(176)
-        .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, zero) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (uint32_t mz = 0; mz < 3; mz++) {
-      for (uint32_t channels = 16; channels < 128; channels += 24) {
-        DWConvMicrokernelTester()
-          .cr(8)
-          .kr(3)
-          .channels(channels)
-          .input_offset(176)
-          .zero_index(mz)
-          .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_ARM
-
-
-#if XNN_ARCH_ARM
   TEST(QC8_DWCONV_MINMAX_FP32_UP16X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_eq_16) {
     TEST_REQUIRES_ARM_NEON_V8;
     DWConvMicrokernelTester()
diff --git a/test/qc8-dwconv-minmax-fp32.yaml b/test/qc8-dwconv-minmax-fp32.yaml
index 9297138..2ddf45d 100644
--- a/test/qc8-dwconv-minmax-fp32.yaml
+++ b/test/qc8-dwconv-minmax-fp32.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 
 # ARM NEON
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35
+  init: xnn_init_qs8_minmax_neonv8_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__neon_mla8_ld64
   init: xnn_init_qs8_minmax_neon_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__neonv8_mla8_ld64
@@ -32,8 +34,6 @@
   init: xnn_init_qs8_minmax_neonv8_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16
   init: xnn_init_qs8_minmax_neonv8_params
-- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35
-  init: xnn_init_qs8_minmax_neonv8_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__aarch32_neonv8_mla8_cortex_a35
   init: xnn_init_qs8_minmax_neonv8_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__neon_mla8_ld64