QS8 C4 Neon GEMM and E2E benchmarks

PiperOrigin-RevId: 407932393
diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc
index 5219a7a..d43ea03 100644
--- a/bench/qs8-gemm-e2e.cc
+++ b/bench/qs8-gemm-e2e.cc
@@ -419,6 +419,28 @@
       1 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
       benchmark::utils::CheckNEON);
   }
+
+  static void qs8_gemm_1x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // same 1x8c4 GEMM kernel passed a second time
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // same 1x8c4 IGEMM kernel passed a second time
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      1 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_1x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // same 1x16c4 GEMM kernel passed a second time
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // same 1x16c4 IGEMM kernel passed a second time
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
 #endif  // XNN_ENABLE_FULL_BENCHMARKS
 
   static void qs8_gemm_2x8s4c2__neon_mull_padal(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -509,6 +531,72 @@
       benchmark::utils::CheckNEON);
   }
 
+  static void qs8_gemm_2x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup,  // 2x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup,  // 2x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_2x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup,  // 2x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup,  // 2x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_3x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup,  // 3x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup,  // 3x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_3x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup,  // 3x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup,  // 3x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_4x8c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup,  // 4x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup,  // 4x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup,  // 1x8c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_4x16c4__neon_mlal_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup,  // 4x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup,  // 4x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup,  // 1x16c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
 #if XNN_ENABLE_FULL_BENCHMARKS
   static void qs8_gemm_1x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
@@ -531,6 +619,28 @@
       1 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
       benchmark::utils::CheckNEON);
   }
+
+  static void qs8_gemm_1x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // same 1x8c4 GEMM kernel passed a second time
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // same 1x8c4 IGEMM kernel passed a second time
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      1 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_1x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // same 1x16c4 GEMM kernel passed a second time
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // same 1x16c4 IGEMM kernel passed a second time
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
 #endif  // XNN_ENABLE_FULL_BENCHMARKS
 
   static void qs8_gemm_2x8c2__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -599,6 +709,72 @@
       benchmark::utils::CheckNEON);
   }
 
+  static void qs8_gemm_2x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup,  // 2x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup,  // 2x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_2x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup,  // 2x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup,  // 2x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_3x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup,  // 3x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup,  // 3x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_3x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup,  // 3x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup,  // 3x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_4x8c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup,  // 4x8c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup,  // 4x8c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup,  // 1x8c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
+  static void qs8_gemm_4x16c4__neon_mull_padal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,  // forwards state and model with the kernels and tile sizes below
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup,  // 4x16c4 GEMM microkernel
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup,  // 4x16c4 IGEMM microkernel
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 GEMM kernel from the same family
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup,  // 1x16c4 IGEMM kernel from the same family
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,  // log2_kr = 2, i.e. kr = 4 as in the c4 name
+      benchmark::utils::CheckNEON);  // NOTE(review): presumably a NEON availability guard; confirm in bench utils
+  }
+
   static void qs8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
@@ -971,13 +1147,38 @@
   BENCHMARK_QS8_END2END(qs8_gemm_4x16c16_gemmlowp__neon_mlal_padal);
 
 #if XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_1x8c4__neon_mlal_padal_dup);  // 1x c4 MLAL entries sit inside the XNN_ENABLE_FULL_BENCHMARKS guard
+  BENCHMARK_QS8_END2END(qs8_gemm_1x16c4__neon_mlal_padal_dup);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_padal_dup);  // 2x to 4x c4 MLAL entries are listed outside that guard
+  BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_padal_dup);
+
+#if XNN_ENABLE_FULL_BENCHMARKS  // 1x c4 MULL entries are listed only when this is non-zero
+  BENCHMARK_QS8_END2END(qs8_gemm_1x8c4__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_1x16c4__neon_mull_padal_dup);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
+  BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_padal_dup);  // 2x to 4x c4 MULL entries are listed outside that guard
+  BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_padal_dup);
+
+#if XNN_ENABLE_FULL_BENCHMARKS
   BENCHMARK_QS8_END2END(qs8_gemm_1x8s4c2__neon_mull_padal);
   BENCHMARK_QS8_END2END(qs8_gemm_1x8s4c2__neon_mlal_padal);
-  BENCHMARK_QS8_END2END(qs8_gemm_1x8c2__neon_mlal_padal_dup);
-  BENCHMARK_QS8_END2END(qs8_gemm_1x16c2__neon_mlal_padal_dup);
 #endif  // XNN_ENABLE_FULL_BENCHMARKS
   BENCHMARK_QS8_END2END(qs8_gemm_2x8s4c2__neon_mull_padal);
   BENCHMARK_QS8_END2END(qs8_gemm_2x8s4c2__neon_mlal_padal);
+
+#if XNN_ENABLE_FULL_BENCHMARKS  // 1x c2 MLAL dup entries are listed only when this is non-zero
+  BENCHMARK_QS8_END2END(qs8_gemm_1x8c2__neon_mlal_padal_dup);
+  BENCHMARK_QS8_END2END(qs8_gemm_1x16c2__neon_mlal_padal_dup);
+#endif  // XNN_ENABLE_FULL_BENCHMARKS
   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_padal_dup);
   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_padal_dup);
   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_padal_dup);
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 430832e..a125e50 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -427,6 +427,70 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_padal_dup, 4, 16, 2, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_1x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_padal_dup, 1, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_padal_dup, 2, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_padal_dup, 3, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_padal_dup, 4, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_padal_dup, 1, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_padal_dup, 2, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_padal_dup, 3, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16c4__neon_mull_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_padal_dup, 4, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_padal_dup, 1, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_padal_dup, 2, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_padal_dup, 3, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_padal_dup, 4, 8, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_1x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_padal_dup, 1, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_2x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_padal_dup, 2, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_3x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_padal_dup, 3, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16c4__neon_mlal_padal_dup(benchmark::State& state, const char* net) {  // net is not referenced in the body
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_padal_dup, 4, 16, 4, 1,  // presumably mr, nr, kr, sr; confirm against GEMMBenchmark
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_1x8c8_gemmlowp__neon_mull_padal(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__neon_mull_padal, 1, 8, 8, 1,
       xnn_init_qs8_conv_minmax_gemmlowp_neon_params, benchmark::utils::CheckNEON);
@@ -555,21 +619,26 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
   }
-
-  BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
-  BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_1x8s4c2__neon_mlal_padal)
   BENCHMARK_GEMM(qs8_gemm_2x8s4c2__neon_mlal_padal)
+  BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_padal_dup)  // c4 MULL dup entries: 1x8 to 4x8, then 1x16 to 4x16
+  BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_padal_dup)  // c4 MLAL dup entries in the same order
+  BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_padal_dup)
@@ -586,6 +655,16 @@
   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_padal_dup)
   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_padal_dup)
+  BENCHMARK_GEMM(qs8_gemm_1x8_gemmlowp__neon_mlal_lane)  // gemmlowp mlal_lane entries: 1x8 to 6x8 (no 5x8), then the x16 set
+  BENCHMARK_GEMM(qs8_gemm_2x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_3x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_4x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_6x8_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_1x16_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_2x16_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_3x16_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_4x16_gemmlowp__neon_mlal_lane)
+  BENCHMARK_GEMM(qs8_gemm_6x16_gemmlowp__neon_mlal_lane)
   BENCHMARK_GEMM(qs8_gemm_1x8c8_gemmlowp__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_2x8c8_gemmlowp__neon_mull_padal)
   BENCHMARK_GEMM(qs8_gemm_3x8c8_gemmlowp__neon_mull_padal)