QS8 C4 Neon GEMM and E2E benchmarks
PiperOrigin-RevId: 407932393
diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc
index 5219a7a..d43ea03 100644
--- a/bench/qs8-gemm-e2e.cc
+++ b/bench/qs8-gemm-e2e.cc
@@ -419,6 +419,28 @@
1 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
benchmark::utils::CheckNEON);
}
+
+ static void qs8_gemm_1x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 1x8c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the tile named by this wrapper.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // Second kernel pair repeats the same 1x8c4 kernels as the first.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 1 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 1x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_1x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 1x16c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the tile named by this wrapper.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // Second kernel pair repeats the same 1x16c4 kernels as the first.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 1x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
#endif // XNN_ENABLE_FULL_BENCHMARKS
static void qs8_gemm_2x8s4c2__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -509,6 +531,72 @@
benchmark::utils::CheckNEON);
}
+ static void qs8_gemm_2x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 2x8c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the 2x8c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // Second kernel pair: 1x8c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 2x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_2x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 2x16c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the 2x16c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // Second kernel pair: 1x16c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 2x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_3x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 3x8c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the 3x8c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // Second kernel pair: 1x8c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 3x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_3x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 3x16c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the 3x16c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // Second kernel pair: 1x16c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 3x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_4x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 4x8c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the 4x8c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // Second kernel pair: 1x8c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 4x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_4x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 4x16c4 neon_mlal_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup,  // First kernel pair: GEMM then IGEMM for the 4x16c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // Second kernel pair: 1x16c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 4x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
#if XNN_ENABLE_FULL_BENCHMARKS
static void qs8_gemm_1x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
GEMMEnd2EndBenchmark(state, model,
@@ -531,6 +619,28 @@
1 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
benchmark::utils::CheckNEON);
}
+
+ static void qs8_gemm_1x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 1x8c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the tile named by this wrapper.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // Second kernel pair repeats the same 1x8c4 kernels as the first.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 1 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 1x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_1x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 1x16c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the tile named by this wrapper.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // Second kernel pair repeats the same 1x16c4 kernels as the first.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 1x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
#endif // XNN_ENABLE_FULL_BENCHMARKS
static void qs8_gemm_2x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -599,6 +709,72 @@
benchmark::utils::CheckNEON);
}
+ static void qs8_gemm_2x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 2x8c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the 2x8c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // Second kernel pair: 1x8c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 2x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_2x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 2x16c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the 2x16c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // Second kernel pair: 1x16c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 2x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_3x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 3x8c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the 3x8c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // Second kernel pair: 1x8c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 3x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_3x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 3x16c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the 3x16c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // Second kernel pair: 1x16c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 3x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_4x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 4x8c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the 4x8c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // Second kernel pair: 1x8c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 4x8 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
+ static void qs8_gemm_4x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {  // End-to-end benchmark wrapper: forwards state and model to GEMMEnd2EndBenchmark with the QS8 4x16c4 neon_mull_padal_dup kernels.
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup,  // First kernel pair: GEMM then IGEMM for the 4x16c4 tile.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // Second kernel pair: 1x16c4 kernels of the same family; NOTE(review): presumably the single-row case -- confirm in GEMMEnd2EndBenchmark.
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // Tile values match the 4x16 name; log2_kr = 2 is consistent with the c4 suffix (2^2 = 4).
+ benchmark::utils::CheckNEON);  // ISA check handed to the harness; NOTE(review): presumably skips the run without NEON -- confirm.
+ }
+
static void qs8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
GEMMEnd2EndBenchmark(state, model,
xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
@@ -971,13 +1147,38 @@
BENCHMARK_QS8_END2END(qs8_gemm_4x16c16_gemmlowp__neon_mlal_padal);
#if XNN_ENABLE_FULL_BENCHMARKS
+ BENCHMARK_QS8_END2END(qs8_gemm_1x8c4__neon_mlal_padal_dup);  // c4 mlal_padal_dup wrappers; the two 1-row tiles sit inside the XNN_ENABLE_FULL_BENCHMARKS guard. NOTE(review): macro is defined outside this view -- presumably registers the wrapper.
+ BENCHMARK_QS8_END2END(qs8_gemm_1x16c4__neon_mlal_padal_dup);
+#endif // XNN_ENABLE_FULL_BENCHMARKS
+ BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_padal_dup);  // Tiles with 2 to 4 rows are listed outside the guard.
+ BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_padal_dup);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+ BENCHMARK_QS8_END2END(qs8_gemm_1x8c4__neon_mull_padal_dup);  // c4 mull_padal_dup wrappers, laid out the same way: 1-row tiles inside the guard.
+ BENCHMARK_QS8_END2END(qs8_gemm_1x16c4__neon_mull_padal_dup);
+#endif // XNN_ENABLE_FULL_BENCHMARKS
+ BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_padal_dup);  // Tiles with 2 to 4 rows are listed outside the guard.
+ BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_padal_dup);
+ BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_padal_dup);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
BENCHMARK_QS8_END2END(qs8_gemm_1x8s4c2__neon_mull_padal);
BENCHMARK_QS8_END2END(qs8_gemm_1x8s4c2__neon_mlal_padal);
- BENCHMARK_QS8_END2END(qs8_gemm_1x8c2__neon_mlal_padal_dup);
- BENCHMARK_QS8_END2END(qs8_gemm_1x16c2__neon_mlal_padal_dup);
#endif // XNN_ENABLE_FULL_BENCHMARKS
BENCHMARK_QS8_END2END(qs8_gemm_2x8s4c2__neon_mull_padal);
BENCHMARK_QS8_END2END(qs8_gemm_2x8s4c2__neon_mlal_padal);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
+ BENCHMARK_QS8_END2END(qs8_gemm_1x8c2__neon_mlal_padal_dup);  // 1-row c2 mlal_padal_dup entries, placed in their own guarded group directly ahead of the 2x to 4x c2 entries.
+ BENCHMARK_QS8_END2END(qs8_gemm_1x16c2__neon_mlal_padal_dup);
+#endif // XNN_ENABLE_FULL_BENCHMARKS
BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_padal_dup);
BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_padal_dup);
BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_padal_dup);
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 430832e..a125e50 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -427,6 +427,70 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, 4, 16, 2, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
}
+ static void qs8_gemm_1x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 1x8c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, 1, 8, 4, 1,  // NOTE(review): numeric args presumably mr=1, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_2x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 2x8c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, 2, 8, 4, 1,  // NOTE(review): numeric args presumably mr=2, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_3x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 3x8c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, 3, 8, 4, 1,  // NOTE(review): numeric args presumably mr=3, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_4x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 4x8c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, 4, 8, 4, 1,  // NOTE(review): numeric args presumably mr=4, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_1x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 1x16c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, 1, 16, 4, 1,  // NOTE(review): numeric args presumably mr=1, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_2x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 2x16c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, 2, 16, 4, 1,  // NOTE(review): numeric args presumably mr=2, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_3x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 3x16c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, 3, 16, 4, 1,  // NOTE(review): numeric args presumably mr=3, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_4x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 4x16c4 neon_mull_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, 4, 16, 4, 1,  // NOTE(review): numeric args presumably mr=4, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_1x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 1x8c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, 1, 8, 4, 1,  // NOTE(review): numeric args presumably mr=1, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_2x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 2x8c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, 2, 8, 4, 1,  // NOTE(review): numeric args presumably mr=2, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_3x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 3x8c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, 3, 8, 4, 1,  // NOTE(review): numeric args presumably mr=3, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_4x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 4x8c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, 4, 8, 4, 1,  // NOTE(review): numeric args presumably mr=4, nr=8, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_1x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 1x16c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, 1, 16, 4, 1,  // NOTE(review): numeric args presumably mr=1, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_2x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 2x16c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, 2, 16, 4, 1,  // NOTE(review): numeric args presumably mr=2, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_3x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 3x16c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, 3, 16, 4, 1,  // NOTE(review): numeric args presumably mr=3, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
+ static void qs8_gemm_4x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // GEMM benchmark wrapper for the QS8 4x16c4 neon_mlal_padal_dup ukernel; `net` is not referenced in this body.
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, 4, 16, 4, 1,  // NOTE(review): numeric args presumably mr=4, nr=16, kr=4, sr=1 -- confirm against GEMMBenchmark.
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);  // Params initializer and ISA check are passed through to the harness.
+ }
static void qs8_gemm_1x8c8_gemmlowp__neon_mull_padal(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, 1, 8, 8, 1,
xnn_init_qs8_conv_minmax_gemmlowp_neon_params, benchmark::utils::CheckNEON);
@@ -555,21 +619,26 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
}
-
- BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
- BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mlal_padal)
BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mlal_padal)
+ BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_padal_dup)  // c4 mull_padal_dup wrappers: x8 tiles first, then x16. NOTE(review): macro is defined outside this view -- presumably registers the wrapper.
+ BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_padal_dup)  // c4 mlal_padal_dup wrappers, in the same x8-then-x16 order.
+ BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_padal_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_padal_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_padal_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_padal_dup)
@@ -586,6 +655,16 @@
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_padal_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_padal_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_padal_dup)
+ BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)  // gemmlowp mlal_lane entries, listed here after the c2 mlal_padal_dup group and before the c8 gemmlowp group.
+ BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
+ BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x8c8_gemmlowp__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_2x8c8_gemmlowp__neon_mull_padal)
BENCHMARK_GEMM(qs8_gemm_3x8c8_gemmlowp__neon_mull_padal)