Align directives for branch targets
- Alignment helps some older CPUs and doesn't hurt newer ones
- Renumber labels on branches
- Sort build file and test order
PiperOrigin-RevId: 452104994
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7d5ee8a..f68b9c2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6396,8 +6396,6 @@
src/x8-lut/gen/lut-avx512skx-vpshufb-x256.c)
SET(AARCH32_ASM_MICROKERNEL_SRCS
- src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
- src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S
src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
@@ -6414,6 +6412,8 @@
src/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S
src/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S
src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
+ src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
+ src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
src/qc8-gemm/gen/1x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S
src/qc8-gemm/gen/1x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S
src/qc8-gemm/gen/1x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a35.S
diff --git a/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S b/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
index 66ca3fe..9a65222 100644
--- a/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
+++ b/src/qc8-dwconv/up16x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
@@ -60,6 +60,7 @@
VDUP.8 q11, d20[3] // output_max
VDUP.16 q10, d20[0] // output_zero_point
+ .p2align 3
0:
LDR r11, [sp, 100] // input_offset
LDMIB r2, {r5, r6} // i0, i1
@@ -86,21 +87,22 @@
// r8 i2
// q12 q13 q14 q15 accumulators
+ .p2align 3
1:
- ADD r7, lr, 64 // skip over bias to get weights
- VLD1.8 {q4}, [r8]! // i2
- VLD1.8 {q12}, [r7]! // w0
- VLD1.8 {q5}, [r5]! // i0
- VLD1.8 {q13}, [r7]! // w1
- VLD1.8 {q6}, [r6]! // i1
- VLD1.8 {q14}, [r7] // w2
+ ADD r7, lr, 64 // skip over bias to get weights
+ VLD1.8 {q4}, [r8]! // i2
+ VLD1.8 {q12}, [r7]! // w0
+ VLD1.8 {q5}, [r5]! // i0
+ VLD1.8 {q13}, [r7]! // w1
+ VLD1.8 {q6}, [r6]! // i1
+ VLD1.8 {q14}, [r7] // w2
- VMULL.S8 q1, d8, d24 // i2 * w0
+ VMULL.S8 q1, d8, d24 // i2 * w0
VMULL.S8 q2, d9, d25
- VMLAL.S8 q1, d10, d26 // i0 * w1
+ VMLAL.S8 q1, d10, d26 // i0 * w1
VMLAL.S8 q2, d11, d27
- VMULL.S8 q0, d12, d28 // i1 * w2
- VLD1.8 {q12, q13}, [lr]! // load bias
+ VMULL.S8 q0, d12, d28 // i1 * w2
+ VLD1.8 {q12, q13}, [lr]! // load bias
VMULL.S8 q3, d13, d29
VLD1.8 {q14, q15}, [lr], r9
@@ -111,7 +113,7 @@
VADDW.S16 q12, q12, d2
VADDW.S16 q13, q13, d3
VADDW.S16 q14, q14, d6
- VLD1.32 {q0, q1}, [lr]! // quant per channel scale values
+ VLD1.32 {q0, q1}, [lr]! // quant per channel scale values
VADDW.S16 q15, q15, d7
VLD1.32 {q2, q3}, [lr]!
@@ -153,42 +155,42 @@
BNE 4f
3:
- LDR r6, [sp, 92] // input_stride
- SUBS r1, r1, 1 // output_width
- ADD r10, r10, r12 // output += output_increment
- ADD r2, r2, r6 // input += input_stride
+ LDR r6, [sp, 92] // input_stride
+ SUBS r1, r1, 1 // output_width
+ ADD r10, r10, r12 // output += output_increment
+ ADD r2, r2, r6 // input += input_stride
BNE 0b
VPOP {d8, d9, d10, d11, d12, d13}
- ADD sp, sp, 4 // pad
+ ADD sp, sp, 4 // pad
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
// Small Remainder - 1-8 channels
4:
- CMP r11, 9 // handle 9 or more
- ADD r7, lr, 64 // skip over bias to get weights
+ CMP r11, 9 // handle 9 or more
+ ADD r7, lr, 64 // skip over bias to get weights
BHS 5f
MOV r9, 16
- VLD1.8 {d8}, [r8] // i2
- VLD1.8 {d24}, [r7], r9 // w0
- VLD1.8 {d10}, [r5] // i0
- VLD1.8 {d26}, [r7], r9 // w1
- VLD1.8 {d12}, [r6] // i1
- VLD1.8 {d28}, [r7] // w2
+ VLD1.8 {d8}, [r8] // i2
+ VLD1.8 {d24}, [r7], r9 // w0
+ VLD1.8 {d10}, [r5] // i0
+ VLD1.8 {d26}, [r7], r9 // w1
+ VLD1.8 {d12}, [r6] // i1
+ VLD1.8 {d28}, [r7] // w2
- VMULL.S8 q1, d8, d24 // i2 * w0
- VMLAL.S8 q1, d10, d26 // i0 * w1
- VMULL.S8 q0, d12, d28 // i1 * w2
- VLD1.8 {q12, q13}, [lr] // load bias
+ VMULL.S8 q1, d8, d24 // i2 * w0
+ VMLAL.S8 q1, d10, d26 // i0 * w1
+ VMULL.S8 q0, d12, d28 // i1 * w2
+ VLD1.8 {q12, q13}, [lr] // load bias
ADD lr, lr, 112
VADDW.S16 q12, q12, d0
VADDW.S16 q13, q13, d1
VADDW.S16 q12, q12, d2
VADDW.S16 q13, q13, d3
- VLD1.32 {q0, q1}, [lr] // quant per channel scale values
+ VLD1.32 {q0, q1}, [lr] // quant per channel scale values
// QC8 FP32 quantization
@@ -211,27 +213,28 @@
// Store 8
TST r11, 8
- BEQ 7f
+ BEQ 6f
VST1.8 {d24}, [r10]!
B 3b
+ .p2align 3
// Large Remainder - 9-15 channels
// Process 16 same as main loop, but conditional store
5:
- VLD1.8 {q4}, [r8]! // i2
- VLD1.8 {q12}, [r7]! // w0
- VLD1.8 {q5}, [r5]! // i0
- VLD1.8 {q13}, [r7]! // w1
- VLD1.8 {q6}, [r6]! // i1
- VLD1.8 {q14}, [r7] // w2
+ VLD1.8 {q4}, [r8]! // i2
+ VLD1.8 {q12}, [r7]! // w0
+ VLD1.8 {q5}, [r5]! // i0
+ VLD1.8 {q13}, [r7]! // w1
+ VLD1.8 {q6}, [r6]! // i1
+ VLD1.8 {q14}, [r7] // w2
- VMULL.S8 q1, d8, d24 // i2 * w0
+ VMULL.S8 q1, d8, d24 // i2 * w0
VMULL.S8 q2, d9, d25
- VMLAL.S8 q1, d10, d26 // i0 * w1
+ VMLAL.S8 q1, d10, d26 // i0 * w1
VMLAL.S8 q2, d11, d27
- VMULL.S8 q0, d12, d28 // i1 * w2
- VLD1.8 {q12, q13}, [lr]! // load bias
+ VMULL.S8 q0, d12, d28 // i1 * w2
+ VLD1.8 {q12, q13}, [lr]! // load bias
VMULL.S8 q3, d13, d29
VLD1.8 {q14, q15}, [lr], r9
@@ -242,7 +245,7 @@
VADDW.S16 q12, q12, d2
VADDW.S16 q13, q13, d3
VADDW.S16 q14, q14, d6
- VLD1.32 {q0, q1}, [lr]! // quant per channel scale values
+ VLD1.32 {q0, q1}, [lr]! // quant per channel scale values
VADDW.S16 q15, q15, d7
VLD1.32 {q2, q3}, [lr]
@@ -280,21 +283,21 @@
VMOV d24, d25
// Store 4
-7:
+6:
TST r11, 4
- BEQ 8f
+ BEQ 7f
VST1.32 {d24[0]}, [r10]!
VEXT.8 d24, d24, d24, 4
// Store 2
-8:
+7:
TST r11, 2
- BEQ 9f
+ BEQ 8f
VST1.16 {d24[0]}, [r10]!
VEXT.8 d24, d24, d24, 2
// Store 1
-9:
+8:
TST r11, 1
BEQ 3b
VST1.8 {d24[0]}, [r10]!
diff --git a/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S b/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
index 5bd1bd4..0cc5be3 100644
--- a/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
+++ b/src/qc8-dwconv/up8x3-minmax-fp32-aarch32-neonv8-mla8-cortex-a35.S
@@ -57,6 +57,7 @@
VDUP.8 d19, d20[3] // output_max
VDUP.16 q10, d20[0] // output_zero_point
+ .p2align 3
0:
LDMIB r2, {r5, r6} // i0, i1
LDR r8, [r2] // i2
@@ -68,7 +69,7 @@
ADDNE r8, r8, r7 // i2 += input_offset
MOV lr, r3
- MOV r11, r0 // channel count as is, fall into loop
+ MOV r11, r0 // channel count as is, fall into loop
// Main loop - 8 channels
// lr weights. r3 reset
@@ -83,19 +84,20 @@
// 32 quant scale - 8 int
// 88 bytes total
+ .p2align 3
1:
- VLD1.8 {q12, q13}, [lr]! // load bias
+ VLD1.8 {q12, q13}, [lr]! // load bias
- VLD1.8 {d4}, [r8]! // i2
- VLD1.8 {d7}, [lr]! // w0
- VLD1.8 {d5}, [r5]! // i0
- VLD1.8 {d16}, [lr]! // w1
- VLD1.8 {d6}, [r6]! // i1
- VLD1.8 {d17}, [lr]! // w2
+ VLD1.8 {d4}, [r8]! // i2
+ VLD1.8 {d7}, [lr]! // w0
+ VLD1.8 {d5}, [r5]! // i0
+ VLD1.8 {d16}, [lr]! // w1
+ VLD1.8 {d6}, [r6]! // i1
+ VLD1.8 {d17}, [lr]! // w2
- VMULL.S8 q1, d4, d7 // i2 * w0
- VMLAL.S8 q1, d5, d16 // i0 * w1
- VMULL.S8 q0, d6, d17 // i1 * w2
+ VMULL.S8 q1, d4, d7 // i2 * w0
+ VMLAL.S8 q1, d5, d16 // i0 * w1
+ VMULL.S8 q0, d6, d17 // i1 * w2
VADDW.S16 q12, q12, d0
@@ -103,7 +105,7 @@
VADDW.S16 q12, q12, d2
VADDW.S16 q13, q13, d3
- VLD1.32 {q0, q1}, [lr]! // quant per channel scale values
+ VLD1.32 {q0, q1}, [lr]! // quant per channel scale values
// QC8 FP32 quantization
@@ -118,27 +120,28 @@
VQMOVN.S32 d24, q12
VQMOVN.S32 d25, q13
- SUBS r11, r11, 8 // 8 channels per loop
+ SUBS r11, r11, 8 // 8 channels per loop
VQADD.S16 q12, q12, q10
VQMOVN.S16 d24, q12
VMIN.S8 d24, d24, d19
VMAX.S8 d24, d24, d18
- BLO 3f // less than 8?
+ BLO 3f // less than 8?
VST1.8 {d24}, [r10]!
- BHI 1b // at least 1, continue loop
+ BHI 1b // at least 1, continue loop
2:
- SUBS r1, r1, 1 // output_width
- ADD r10, r10, r12 // output += output_increment
- ADD r2, r2, r9 // input += input_stride
+ SUBS r1, r1, 1 // output_width
+ ADD r10, r10, r12 // output += output_increment
+ ADD r2, r2, r9 // input += input_stride
BNE 0b
ADD sp, sp, 4
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .p2align 3
// Store 4
3:
TST r11, 4
@@ -164,4 +167,4 @@
END_FUNCTION xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
-#endif
\ No newline at end of file
+#endif
diff --git a/test/qc8-dwconv-minmax-fp32.cc b/test/qc8-dwconv-minmax-fp32.cc
index 7adf37a..0ad34a2 100644
--- a/test/qc8-dwconv-minmax-fp32.cc
+++ b/test/qc8-dwconv-minmax-fp32.cc
@@ -20,6 +20,192 @@
#include "dwconv-microkernel-tester.h"
+#if XNN_ARCH_ARM
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_eq_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(8)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_lt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 1; channels < 8; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .width(3)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_step) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ for (size_t step = 2; step <= 3; step++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .width(3)
+ .step(step)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_output_stride) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(8)
+ .width(5)
+ .output_stride(43)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .width(3)
+ .qmin(128)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .width(3)
+ .qmax(128)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, input_offset) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .input_offset(176)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, zero) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t mz = 0; mz < 3; mz++) {
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(3)
+ .channels(channels)
+ .input_offset(176)
+ .zero_index(mz)
+ .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM
+
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__NEON_MLA8_LD64, c_eq_8) {
TEST_REQUIRES_ARM_NEON;
@@ -2625,192 +2811,6 @@
#if XNN_ARCH_ARM
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_eq_8) {
- TEST_REQUIRES_ARM_NEON_V8;
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(8)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 16; channels < 128; channels += 24) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmin) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 16; channels < 128; channels += 24) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .qmin(128)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_div_8_with_qmax) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 16; channels < 128; channels += 24) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .qmax(128)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_lt_8) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 1; channels < 8; channels++) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 9; channels < 16; channels++) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmin) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 9; channels < 16; channels++) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .qmin(128)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_gt_8_with_qmax) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 9; channels < 16; channels++) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .qmax(128)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (size_t channels = 1; channels <= 40; channels += 7) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .width(3)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_step) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (size_t channels = 1; channels <= 40; channels += 7) {
- for (size_t step = 2; step <= 3; step++) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .width(3)
- .step(step)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_output_stride) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (size_t channels = 1; channels <= 40; channels += 7) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(8)
- .width(5)
- .output_stride(43)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmin) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (size_t channels = 1; channels <= 40; channels += 7) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .width(3)
- .qmin(128)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, multipixel_with_qmax) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (size_t channels = 1; channels <= 40; channels += 7) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .width(3)
- .qmax(128)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, input_offset) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t channels = 16; channels < 128; channels += 24) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .input_offset(176)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
-
- TEST(QC8_DWCONV_MINMAX_FP32_UP8X3__AARCH32_NEONV8_MLA8_CORTEX_A35, zero) {
- TEST_REQUIRES_ARM_NEON_V8;
- for (uint32_t mz = 0; mz < 3; mz++) {
- for (uint32_t channels = 16; channels < 128; channels += 24) {
- DWConvMicrokernelTester()
- .cr(8)
- .kr(3)
- .channels(channels)
- .input_offset(176)
- .zero_index(mz)
- .Test(xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35, xnn_init_qs8_minmax_neonv8_params, xnn_qs8_requantize_fp32);
- }
- }
- }
-#endif // XNN_ARCH_ARM
-
-
-#if XNN_ARCH_ARM
TEST(QC8_DWCONV_MINMAX_FP32_UP16X3__AARCH32_NEONV8_MLA8_CORTEX_A35, c_eq_16) {
TEST_REQUIRES_ARM_NEON_V8;
DWConvMicrokernelTester()
diff --git a/test/qc8-dwconv-minmax-fp32.yaml b/test/qc8-dwconv-minmax-fp32.yaml
index 9297138..2ddf45d 100644
--- a/test/qc8-dwconv-minmax-fp32.yaml
+++ b/test/qc8-dwconv-minmax-fp32.yaml
@@ -4,6 +4,8 @@
# LICENSE file in the root directory of this source tree.
# ARM NEON
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35
+ init: xnn_init_qs8_minmax_neonv8_params
- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__neon_mla8_ld64
init: xnn_init_qs8_minmax_neon_params
- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__neonv8_mla8_ld64
@@ -32,8 +34,6 @@
init: xnn_init_qs8_minmax_neonv8_params
- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16
init: xnn_init_qs8_minmax_neonv8_params
-- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__aarch32_neonv8_mla8_cortex_a35
- init: xnn_init_qs8_minmax_neonv8_params
- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__aarch32_neonv8_mla8_cortex_a35
init: xnn_init_qs8_minmax_neonv8_params
- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__neon_mla8_ld64