Quantized GEMM/IGEMM microkernels round kc up to a multiple of channels.

Rewind the A pointers by kc.
Remove the last partial channel from the remainder code; it is now handled by the main loop.
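
A minimal sketch of the rounding helper, assuming the usual power-of-two
idiom (the canonical definition lives in <xnnpack/math.h>, which each
kernel now includes; the body below is illustrative, not copied from the
header):

    #include <assert.h>
    #include <stddef.h>

    // Round n up to the next multiple of q, where q must be a power of two
    // (here: the channel group size, e.g. 2, 4, 8, or 16).
    static inline size_t round_up_po2(size_t n, size_t q) {
      assert(q != 0);
      assert((q & (q - 1)) == 0);  // q must be a power of two
      return (n + q - 1) & ~(q - 1);
    }

With kc rounded up front (e.g. round_up_po2(7, 8) == 8), the accumulation
loop advances each A pointer by exactly the rounded kc, so a single
`a - kc` rewind after the stores replaces the per-variant `a - k` and
`a - (kc - k)` arithmetic.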

PiperOrigin-RevId: 360231001
diff --git a/src/qs8-gemm/MRx16c8-avx512skx.c.in b/src/qs8-gemm/MRx16c8-avx512skx.c.in
index 973c81c..f28a7db 100644
--- a/src/qs8-gemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-gemm/MRx16c8-avx512skx.c.in
@@ -12,6 +12,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -196,10 +198,10 @@
         _mm_storeu_si128((__m128i*) c1, _mm256_extracti128_si256(vout01x0123456789ABCDEF, 1));
 
         $for M in range(MR):
-          a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+          c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
         $for M in range(MR):
-          c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+          a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
         nc -= 16;
       } else {
diff --git a/src/qs8-gemm/MRx4c2-sse.c.in b/src/qs8-gemm/MRx4c2-sse.c.in
index d48eaa5..53eb918 100644
--- a/src/qs8-gemm/MRx4c2-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-sse.c.in
@@ -19,6 +19,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -45,6 +46,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -205,27 +207,6 @@
             $else:
               vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
                 _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            $if VARIANT == "EXTENDED":
-              const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-            $else:
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              $if SSE >= 4:
-                const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-              $else:
-                const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-              w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            $for M in range(MR):
-              $if SSE == 5:
-                vacc${M}x0123 = _mm_maddd_epi16(
-                  _mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc${M}x0123);
-              $else:
-                vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
-                  _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -325,10 +306,10 @@
           *((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(vout);
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 4;
     } else {
diff --git a/src/qs8-gemm/MRx4c8-sse.c.in b/src/qs8-gemm/MRx4c8-sse.c.in
index 2e8dbe5..9e98cfd 100644
--- a/src/qs8-gemm/MRx4c8-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-sse.c.in
@@ -19,6 +19,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -45,6 +46,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -244,10 +246,10 @@
           *((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(vout);
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 4;
     } else {
diff --git a/src/qs8-gemm/MRx4c8-wasmsimd.c.in b/src/qs8-gemm/MRx4c8-wasmsimd.c.in
index dbc36b2..a10c7dd 100644
--- a/src/qs8-gemm/MRx4c8-wasmsimd.c.in
+++ b/src/qs8-gemm/MRx4c8-wasmsimd.c.in
@@ -10,6 +10,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -170,10 +172,10 @@
         *((float*) c${M}) = (float) wasm_f32x4_extract_lane(vout, ${M});
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 4;
     } else {
diff --git a/src/qs8-gemm/MRx8c8-avx2.c.in b/src/qs8-gemm/MRx8c8-avx2.c.in
index e403810..3f17e73 100644
--- a/src/qs8-gemm/MRx8c8-avx2.c.in
+++ b/src/qs8-gemm/MRx8c8-avx2.c.in
@@ -11,6 +11,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -169,10 +171,10 @@
         _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi));
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= 8;
     } else {
diff --git a/src/qs8-gemm/MRxNRc4-neondot.c.in b/src/qs8-gemm/MRxNRc4-neondot.c.in
index c26b7d3..49b43d2 100644
--- a/src/qs8-gemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-gemm/MRxNRc4-neondot.c.in
@@ -6,12 +6,12 @@
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 $assert NR % 8 == 0
 $assert 8 <= NR <= 16
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c4__neondot(
@@ -29,7 +29,12 @@
   assert(mr <= ${MR});
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -82,7 +87,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 6 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a ${MR}x4 block of activations.
       $for M in range(MR):
@@ -108,13 +113,8 @@
               vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb4567x${ABC[N:N+4]}, va${M}x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    $for M in range(MR):
-      a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -205,6 +205,9 @@
       $for M in range(MR):
         c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
+      $for M in range(MR):
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+
       nc -= ${NR};
     } else {
       // Final case where not all of the ${NR} columns fit in the destination.
diff --git a/src/qs8-gemm/c16-neon-mlal-padal.c.in b/src/qs8-gemm/c16-neon-mlal-padal.c.in
index a6594ef..ed65c1e 100644
--- a/src/qs8-gemm/c16-neon-mlal-padal.c.in
+++ b/src/qs8-gemm/c16-neon-mlal-padal.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c16__neon_mlal_padal(
@@ -35,6 +35,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -63,7 +64,7 @@
       $for N in range(NR):
         int32x4_t vacc${M}x${N} = vacc0x${N};
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       $for M in range(MR):
@@ -191,7 +192,7 @@
         c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= ${NR};
     } else {
diff --git a/src/qs8-gemm/c2-neon-mull-padal-dup.c.in b/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
index ba19f77..cbc793c 100644
--- a/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
+++ b/src/qs8-gemm/c2-neon-mull-padal-dup.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
@@ -35,6 +35,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -135,16 +136,6 @@
             $for N in range(0, NR, 4):
               const int16x8_t vprod${M}x${ABC[N:N+4]}c2 = vmull_s8(vb${ABC[N:N+4]}c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 2)));
               vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            $for N in range(0, NR, 4):
-              const int8x8_t vb${ABC[N:N+4]}c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            $for M in range(MR):
-              $for N in range(0, NR, 4):
-                const int16x8_t vprod${M}x${ABC[N:N+4]}c3 = vmull_s8(vb${ABC[N:N+4]}c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 3)));
-                vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/c8-neon-mull-padal.c.in b/src/qs8-gemm/c8-neon-mull-padal.c.in
index 12edb18..205f650 100644
--- a/src/qs8-gemm/c8-neon-mull-padal.c.in
+++ b/src/qs8-gemm/c8-neon-mull-padal.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_${MR}x${NR}c8__neon_mull_padal(
@@ -35,6 +35,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   $for M in range(1, MR):
@@ -210,7 +211,7 @@
         c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
 
       $for M in range(MR):
-        a${M} = (const int8_t*) ((uintptr_t) a${M} - (kc - k));
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
       nc -= ${NR};
     } else {
diff --git a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c b/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
index 2dabf60..c1665fd 100644
--- a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_12x8c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 12);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -205,7 +210,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 6 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 12x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -283,23 +288,8 @@
         vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-    a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-    a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-    a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-    a4 = (const int8_t*) ((uintptr_t) a4 - kc);
-    a5 = (const int8_t*) ((uintptr_t) a5 - kc);
-    a6 = (const int8_t*) ((uintptr_t) a6 - kc);
-    a7 = (const int8_t*) ((uintptr_t) a7 - kc);
-    a8 = (const int8_t*) ((uintptr_t) a8 - kc);
-    a9 = (const int8_t*) ((uintptr_t) a9 - kc);
-    a10 = (const int8_t*) ((uintptr_t) a10 - kc);
-    a11 = (const int8_t*) ((uintptr_t) a11 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -466,6 +456,19 @@
       c10 = (int8_t*) ((uintptr_t) c10 + cn_stride);
       c11 = (int8_t*) ((uintptr_t) c11 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+      a6 = (const int8_t*) ((uintptr_t) a6 - kc);
+      a7 = (const int8_t*) ((uintptr_t) a7 - kc);
+      a8 = (const int8_t*) ((uintptr_t) a8 - kc);
+      a9 = (const int8_t*) ((uintptr_t) a9 - kc);
+      a10 = (const int8_t*) ((uintptr_t) a10 - kc);
+      a11 = (const int8_t*) ((uintptr_t) a11 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
index a3dd986..ce7d5d9 100644
--- a/src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/1x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -57,7 +58,7 @@
     int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
     int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -216,7 +217,7 @@
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
index 43eec29..7c5f25d 100644
--- a/src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -238,22 +239,6 @@
           vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
           const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
           vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
index a862b80..8447491 100644
--- a/src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -150,22 +151,6 @@
           vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
           const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
           vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
index 86441dd..964525e 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -72,7 +77,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 6 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 1x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -103,12 +108,8 @@
         vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -153,6 +154,8 @@
       // Advance to the next 16 columns.
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
index e03b182..52ac712 100644
--- a/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/1x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
diff --git a/src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
index 880e8f2..a345ea0 100644
--- a/src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -291,7 +292,7 @@
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
index 36d08bb..f0c02e1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
index 96aae54..1b28605 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
index 45fee98..11eafd3 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -147,10 +140,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
index de32f50..d1bcbd1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -147,10 +140,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
index 9e517ba..bdb39b9 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
index 7529af6..11f165f 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -99,15 +101,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -163,10 +156,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
index 464f22f..7473e30 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -104,15 +106,6 @@
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-          }
         }
       }
     }
@@ -152,10 +145,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
index 8038248..ece3f81 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -104,15 +106,6 @@
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-          }
         }
       }
     }
@@ -152,10 +145,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
index 70c7dd0..6c6dd0d 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -92,14 +94,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -155,10 +149,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
index d5fa2c3..2783338 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -92,14 +94,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -139,10 +133,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
index 40b3ea8..6019e9c 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -92,14 +94,6 @@
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -155,10 +149,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
index 624ac2b..b4a2490 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -97,14 +99,6 @@
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-          }
         }
       }
     }
@@ -144,10 +138,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
index 87d81ff..135dcdb 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -126,10 +128,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
index 3467c1e..15bd4fd 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -128,10 +130,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
index e7b623c..df8bf83 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -110,10 +112,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
index 6db42e2..f2e0e3a 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -112,10 +114,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
index d0323dd..5f90494 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -126,10 +128,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
index c8cd8ad..7031314 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -128,10 +130,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
index ba3e4f1..e14c5d1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -119,10 +121,10 @@
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c
index 428426a..3ea0022 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -115,10 +117,10 @@
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
index e89b0a2..5411da3 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -115,10 +117,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
index f281502..6bab96c 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -117,10 +119,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
index 84cd542..4d50667 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -124,10 +126,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
index 59b547c..3dc5959 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -108,10 +110,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
index 6e1f9c9..31d4fc5 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -124,10 +126,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c
index 7cc7643..ffb826b 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-wasmsimd.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__wasmsimd(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -115,10 +117,10 @@
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
index 29349f9..d4f4020 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -113,10 +115,10 @@
     if (nc >= 4) {
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
index 5b7e972..5bf0eca 100644
--- a/src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/1x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -49,7 +50,7 @@
     int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
     int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -148,7 +149,7 @@
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
 
       nc -= 8;
     } else {
diff --git a/src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
index 01eea15..378b1b4 100644
--- a/src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -154,16 +155,6 @@
           vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
           const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
           vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
index 26b3be1..9576c96 100644
--- a/src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -106,16 +107,6 @@
           vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
           const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
           vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
index 5fc71c0..8242be7 100644
--- a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -62,7 +67,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 1x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -85,12 +90,8 @@
         vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -127,6 +128,8 @@
       // Advance to the next 8 columns.
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
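
The rewind also moves after the stores and uses kc directly: with kc padded
up front, the accumulation loop advances a0 by exactly kc bytes, so
subtracting kc repositions a0 at the start of the same row for the next tile
of output columns, and the old comment tracking the advance in k is moot.
A minimal single-row sketch of the pattern, assuming kc is a multiple of 8
for brevity (names are illustrative, not the kernel's actual inner loop):

  #include <stddef.h>
  #include <stdint.h>

  // One row of A, reused across successive tiles of output columns.
  static void process_row(const int8_t* a, size_t kc, size_t tiles) {
    const int8_t* a0 = a;
    while (tiles != 0) {
      for (size_t k = kc; k != 0; k -= 8) {
        // load 8 bytes of A here and multiply-accumulate against weights
        a0 += 8;
      }
      // a0 is now kc bytes past the row start; rewind for the next tile.
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      tiles -= 1;
    }
  }
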
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
index 29b2e86..adbc9b5 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -128,10 +130,10 @@
     if (nc >= 8) {
       _mm_storel_epi64((__m128i*) c0, vout_lo);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
index cdac06c..5fb7fa6 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -191,7 +192,7 @@
 
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
 
       nc -= 8;
     } else {
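
The same normalization applies to the mull-padal kernels above: the old
rewind by (kc - k) computed the number of A bytes actually advanced, with k
the unconsumed tail. Once kc is padded to a multiple of 8 the tail is fully
consumed, so the advance equals kc by construction and every variant of the
old expression (k, kc - k) collapses to kc. A self-check under that
assumption (illustrative, not from the source):

  #include <assert.h>
  #include <stddef.h>

  int main(void) {
    for (size_t kc_orig = 1; kc_orig <= 24; kc_orig++) {
      const size_t kc = (kc_orig + 7) & ~(size_t) 7;  // round_up_po2(kc_orig, 8)
      size_t advanced = 0;
      for (size_t k = 0; k < kc; k += 8) {
        advanced += 8;  // mirrors a0 += 8 per unrolled step
      }
      assert(advanced == kc);  // rewinding by kc restores the row pointer
    }
    return 0;
  }
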
diff --git a/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
index e039738..9d51b83 100644
--- a/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_1x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -124,10 +126,10 @@
     if (nc >= 8) {
       _mm_storel_epi64((__m128i*) c0, vout_lo);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
index ba85d48..bab89e5 100644
--- a/src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/2x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -79,7 +80,7 @@
     int32x4_t vacc1x14 = vacc0x14;
     int32x4_t vacc1x15 = vacc0x15;
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -349,8 +350,8 @@
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
index 91f2a9d..cbb3d94 100644
--- a/src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -356,30 +357,6 @@
           vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
           const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
           vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-            const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
index 4943ccd..3cc7e4e 100644
--- a/src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -218,30 +219,6 @@
           vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
           const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
           vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-            const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
index bfc594b..1e18779 100644
--- a/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/2x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -145,12 +147,12 @@
       _mm_storeu_si128((__m128i*) c0, _mm256_castsi256_si128(vout01x0123456789ABCDEF));
       _mm_storeu_si128((__m128i*) c1, _mm256_extracti128_si256(vout01x0123456789ABCDEF, 1));
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 16;
     } else {
       // Prepare mask for valid 8-bit elements (depends on nc).
diff --git a/src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
index 7b6d7c6..2c0c623 100644
--- a/src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -458,8 +459,8 @@
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
-      a1 = (const int8_t*) ((uintptr_t) a1 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
index 2dea0d2..d3f8bc0 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -166,12 +168,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
index 52c118d..61a1344 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -168,12 +170,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
index 166947b..94bd5be 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -141,12 +143,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
index 139c4cf..3765c44 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -143,12 +145,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
index 0d0262a..10307ca 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -166,12 +168,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
index 08b9f64..59cf669 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -168,12 +170,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c
index a77acde..2ad04a9 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -155,12 +157,12 @@
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c
index c3f5483..a00ad6c 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -151,12 +153,12 @@
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
index 83ccb6a..610ecd3 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -146,12 +148,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
index e918a7a..5ef963f 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -148,12 +150,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
index 38cc155..5336fcb 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -164,12 +166,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
index 0202b27..c6cca2c 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -139,12 +141,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
index 79f8767..521105b 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -164,12 +166,12 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c
index 00a234e..2a11031 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-wasmsimd.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -151,12 +153,12 @@
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
index 1969160..6e41cc5 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -144,12 +146,12 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
index 269d94d..a2d58a2 100644
--- a/src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/2x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -63,7 +64,7 @@
     int32x4_t vacc1x6 = vacc0x6;
     int32x4_t vacc1x7 = vacc0x7;
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -217,8 +218,8 @@
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
 
       nc -= 8;
     } else {
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
index e3d3570..850d04d 100644
--- a/src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -218,20 +219,6 @@
           vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
           const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
           vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
index 5d4e735..a98d1e9 100644
--- a/src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -144,20 +145,6 @@
           vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
           const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
           vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
index 677a96b..f449528 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -160,12 +162,12 @@
       _mm_storel_epi64((__m128i*) c0, vout_lo);
       _mm_storel_epi64((__m128i*) c1, vout_hi);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
index ac2af6c..c98b99c 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -278,8 +279,8 @@
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
-      a1 = (const int8_t*) ((uintptr_t) a1 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
 
       nc -= 8;
     } else {
diff --git a/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
index bc7a3b3..e228916 100644
--- a/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_2x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -156,12 +158,12 @@
       _mm_storel_epi64((__m128i*) c0, vout_lo);
       _mm_storel_epi64((__m128i*) c1, vout_hi);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
index c15c35e..8dd6360 100644
--- a/src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/3x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -101,7 +102,7 @@
     int32x4_t vacc2x14 = vacc0x14;
     int32x4_t vacc2x15 = vacc0x15;
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -482,9 +483,9 @@
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
index 7750eb3..e3d71f4 100644
--- a/src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -474,38 +475,6 @@
           vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
           const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
           vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-            const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-            const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
index bb497f5..68671c7 100644
--- a/src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -286,38 +287,6 @@
           vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
           const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
           vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-            const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-            const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
index b5e91ec..9970bc3 100644
--- a/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/3x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
diff --git a/src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
index 3b9b1ec..28ac128 100644
--- a/src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -625,9 +626,9 @@
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
-      a1 = (const int8_t*) ((uintptr_t) a1 - (kc - k));
-      a2 = (const int8_t*) ((uintptr_t) a2 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
index 709ef30..3e40c43 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -208,14 +210,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
index c01210e..ec8e708 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -210,14 +212,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
index fe38e93..eefed28 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -174,14 +176,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
index f511ba1..1fac46b 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -176,14 +178,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
index 166acff..d5dedb7 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -208,14 +210,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
index 255c8e2..661e5ec 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -210,14 +212,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c
index 8a037ed..138f883 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -192,14 +194,14 @@
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
       *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c
index f63c0f0..63303a9 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -188,14 +190,14 @@
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
       *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
index f3beb05..df0e5d5 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -179,14 +181,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
index f82a244..090bd73 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -181,14 +183,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
index 1da482f..eb77d6e 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -206,14 +208,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
index 27ec277..167ab31 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -172,14 +174,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
index 93fe44b..d299855 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -206,14 +208,14 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c
index 4258337..7cc2d01 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-wasmsimd.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -188,14 +190,14 @@
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
       *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
index 0f8932b..79cdbf1 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -177,14 +179,14 @@
       *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
index 079d6d2..0b18356 100644
--- a/src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/3x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -77,7 +78,7 @@
     int32x4_t vacc2x6 = vacc0x6;
     int32x4_t vacc2x7 = vacc0x7;
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -290,9 +291,9 @@
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
 
       nc -= 8;
     } else {
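
For the c16 kernels the rounding makes the old 1..15-byte tail unreachable, which is why the loop comment drops "with up to 15 remainder" and the epilogue rewinds by kc. A worked sketch with illustrative values, reusing the round_up_po2 sketch above:

    size_t kc = 20;             // caller-supplied K
    kc = round_up_po2(kc, 16);  // -> 32; weights are packed to the same multiple
    size_t k = 0;
    while (k < kc) {            // exactly kc / 16 = 2 full steps
      // ... vld1q_s8 loads advance a0 by 16 bytes per step ...
      k += 16;
    }
    // a0 has advanced by exactly kc bytes, so "a0 -= kc" restores the row start.
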
diff --git a/src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
index 7f095a9..4d90e3a 100644
--- a/src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -282,24 +283,6 @@
           vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
           const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
           vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-          }
         }
       }
     }
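
The deleted `if (k > 6 * sizeof(int8_t))` blocks handled a 7th channel in the sub-8 tail of the c2 kernels; since k stays below 8 there, the condition means exactly k == 7. Once kc is rounded up to a multiple of 2, that tail is always even and the branch is dead code. A self-contained check of the parity argument (hypothetical harness, not part of the library):

    #include <assert.h>
    #include <stddef.h>

    static size_t round_up_po2(size_t n, size_t q) {
      return (n + q - 1) & ~(q - 1);
    }

    int main(void) {
      for (size_t kc = 1; kc <= 1024; kc++) {
        const size_t k = round_up_po2(kc, 2) % 8;  // tail seen by the 8-wide loop
        assert(k % 2 == 0);                        // always even: 0, 2, 4 or 6
        assert(k != 7);                            // so the removed branch never fired
      }
      return 0;
    }
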
diff --git a/src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
index 2a8d3d90..cdd6df2 100644
--- a/src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -182,24 +183,6 @@
           vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
           const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
           vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
index f4d3c06..d703f98 100644
--- a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -195,14 +197,14 @@
       _mm_storel_epi64((__m128i*) c1, vout_hi);
       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
index b1c926f..e905075 100644
--- a/src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -369,9 +370,9 @@
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
-      a1 = (const int8_t*) ((uintptr_t) a1 - (kc - k));
-      a2 = (const int8_t*) ((uintptr_t) a2 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
 
       nc -= 8;
     } else {
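
The mull-padal c8 kernels are the one place where the old rewind was written as `kc - k` rather than `k`, matching how far that particular loop structure had advanced the pointers. After rounding, every variant consumes exactly kc bytes per pass, so all of the per-kernel rewind expressions collapse to plain kc. A toy model of the invariant (illustrative, not library code):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    int main(void) {
      int8_t row[24] = {0};
      const size_t kc = sizeof(row);        // already a multiple of 8 after rounding
      const int8_t* a0 = row;
      for (size_t k = 0; k < kc; k += 8) {  // 8 bytes per step, no tail
        a0 += 8;
      }
      assert((size_t) (a0 - row) == kc);    // advanced exactly kc bytes
      a0 -= kc;                             // the uniform rewind restores the row
      assert(a0 == row);
      return 0;
    }
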
diff --git a/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
index 370b8d7..290c6af 100644
--- a/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
+++ b/src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_3x8c8__avx2(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -191,14 +193,14 @@
       _mm_storel_epi64((__m128i*) c1, vout_hi);
       _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
       nc -= 8;
     } else {
       if (nc & 4) {
diff --git a/src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
index 9ad5300..53cbee3 100644
--- a/src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/4x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -123,7 +124,7 @@
     int32x4_t vacc3x14 = vacc0x14;
     int32x4_t vacc3x15 = vacc0x15;
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -615,10 +616,10 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-      a3 = (const int8_t*) ((uintptr_t) a3 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
index 19d1d86..c8685b2 100644
--- a/src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -592,46 +593,6 @@
           vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
           const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
           vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-            const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-            const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-            const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-            const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-            const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
-            const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
index bcd0072..ea9270c 100644
--- a/src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -354,46 +355,6 @@
           vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
           const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
           vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-            const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-            const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-            const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-            const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-            const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-            const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
-            const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
index 9b16993..719db22 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 4);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -129,7 +134,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 4x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -187,15 +192,8 @@
         vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-    a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-    a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-    a3 = (const int8_t*) ((uintptr_t) a3 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -306,6 +304,11 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
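
In the neondot kernel the rewind also moves from right after the accumulation loop into the full-tile branch: when fewer than 16 columns remain, the kernel writes the final partial tile and returns, so rewinding the A pointers there was wasted work. A compilable control-flow sketch of the tile loop after this change (toy model with illustrative names, not the real kernel):

    #include <stddef.h>
    #include <stdint.h>

    static void tile_loop(const int8_t* a0, size_t kc, size_t nc) {
      do {
        a0 += kc;      // stands in for the dot-product loop over kc bytes
        if (nc >= 16) {
          a0 -= kc;    // rewind only when another 16-column tile follows
          nc -= 16;
        } else {
          nc = 0;      // final partial tile: no rewind needed, loop exits
        }
      } while (nc != 0);
      (void) a0;
    }

    int main(void) {
      int8_t row[8] = {0};
      tile_loop(row, sizeof(row), 40);  // two full 16-column tiles, then a tail of 8
      return 0;
    }
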
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c b/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
index 0ded25f..afa0f41 100644
--- a/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
+++ b/src/qs8-gemm/gen/4x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/gemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx(
@@ -36,6 +37,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
diff --git a/src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
index 9121043..ed64d8b 100644
--- a/src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -792,10 +793,10 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
-      a1 = (const int8_t*) ((uintptr_t) a1 - (kc - k));
-      a2 = (const int8_t*) ((uintptr_t) a2 - (kc - k));
-      a3 = (const int8_t*) ((uintptr_t) a3 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
 
       nc -= 16;
     } else {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
index 88b2a44..fc682f9 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
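
A side note on the idiom inside the deleted blocks: the SSE2/SSSE3 kernels sign-extend the int8 weights with `_mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3))` (interleaving each byte with its sign mask), while SSE4.1 and XOP use `_mm_cvtepi8_epi16`. Both produce the same int16 lanes; a small demonstration (hypothetical harness, not library code):

    #include <assert.h>
    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    int main(void) {
      const int8_t b[8] = {-128, -1, 0, 1, 127, -7, 42, -100};
      const __m128i vb = _mm_loadl_epi64((const __m128i*) b);
      // SSE2 sign extension: pair each byte with 0xFF if negative, 0x00 otherwise.
      const __m128i vxb = _mm_unpacklo_epi8(vb, _mm_cmpgt_epi8(_mm_setzero_si128(), vb));
      int16_t out[8];
      _mm_storeu_si128((__m128i*) out, vxb);
      for (int i = 0; i < 8; i++) {
        assert(out[i] == (int16_t) b[i]);  // matches _mm_cvtepi8_epi16 on SSE4.1
      }
      return 0;
    }
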
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
index bd28fe4..4e99dd2 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse2_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
index 8395f05..49935f0 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -269,16 +256,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
index c06b1f3..2e985a2 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -269,16 +256,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
index 0d91bcb..81ed872 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
index e3c1141..053a947 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__ssse3_ld64(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -180,21 +182,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -312,16 +299,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
index 24f9fde..860d51b 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -185,21 +187,6 @@
             _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
           vacc3x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            vacc1x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-            vacc2x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-            vacc3x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-          }
         }
       }
     }
@@ -274,16 +261,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
index a06cd6c..5f89488 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -185,21 +187,6 @@
             _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
           vacc3x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            vacc1x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-            vacc2x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-            vacc3x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-          }
         }
       }
     }
@@ -274,16 +261,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
index 0b90db8..030105f 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse2(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -173,20 +175,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -304,16 +292,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
index 100c932..d10630b 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -173,20 +175,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -261,16 +249,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
index 4ae78f7..637a1ca 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__ssse3(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -173,20 +175,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
@@ -304,16 +292,16 @@
       vout = _mm_srli_si128(vout, 4);
       *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
index b6275df..471b856 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -178,20 +180,6 @@
             _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
           vacc3x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const __m128i vxb3 = _mm_load_si128((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8 * sizeof(int16_t));
-
-            vacc0x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            vacc1x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-            vacc2x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-            vacc3x0123 = _mm_maddd_epi16(
-              _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-          }
         }
       }
     }
@@ -266,16 +254,16 @@
       *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
       *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {
diff --git a/src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c b/src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
index 76309f4..45b6a5e 100644
--- a/src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-gemm/gen/4x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -91,7 +92,7 @@
     int32x4_t vacc3x6 = vacc0x6;
     int32x4_t vacc3x7 = vacc0x7;
 
-    // KC loop of 16 with up to 15 remainder
+    // KC loop of 16
     size_t k = 0;
     while (k < kc) {
       const int8x16_t va0 = vld1q_s8(a0); a0 += 16;
@@ -359,10 +360,10 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-      a1 = (const int8_t*) ((uintptr_t) a1 - k);
-      a2 = (const int8_t*) ((uintptr_t) a2 - k);
-      a3 = (const int8_t*) ((uintptr_t) a3 - k);
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
 
       nc -= 8;
     } else {
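
In the c16 kernel above the rewind amount changes from the loop counter `k`
to `kc`. Once kc is rounded up front, every A pointer advances by exactly kc
bytes over the K loop, so the rounded kc is the single correct rewind amount;
the same reasoning covers the `(kc - k)` rewinds replaced further down. A
hypothetical check of that invariant, with names invented for illustration:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // With kc already rounded up to the 16-byte block size, a loop that
    // consumes 16 bytes per iteration advances a0 by exactly kc bytes,
    // so subtracting kc restores the start of the row.
    static void check_rewind(const int8_t* row, size_t kc) {
      const int8_t* a0 = row;
      for (size_t k = 0; k < kc; k += 16) {
        a0 += 16;  // one 16-channel block of activations consumed
      }
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      assert(a0 == row);
    }
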
diff --git a/src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
index b7403d4..ddb72c8 100644
--- a/src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -346,28 +347,6 @@
           vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
           const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
           vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-            const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
index 95d7257..a7f10cd 100644
--- a/src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -220,28 +221,6 @@
           vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
           const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
           vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
-
-          if (k > 6 * sizeof(int8_t)) {
-            const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-            const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-            const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-            const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-            const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-            const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-            const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-          }
         }
       }
     }
diff --git a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
index ce6a0fb..d32b05e 100644
--- a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 4);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -101,7 +106,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 4x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -139,15 +144,8 @@
         vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-    a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-    a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-    a3 = (const int8_t*) ((uintptr_t) a3 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -218,6 +216,11 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
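
The neondot kernels gain the standard assertion block together with the kc
rounding. Because the remainder path always loads whole 4-byte groups,
rounding kc up means the kernel may read a few bytes past the logical end of
each activation row, so callers must keep those trailing bytes addressable.
A caller-side sketch under that assumption; the padding constant below is
illustrative, not a library API:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define A_PADDING 16  // hypothetical over-read allowance

    // Allocate mr activation rows of kc bytes each, with defined bytes
    // after the end so a kernel that reads round_up_po2(kc, 4) bytes per
    // row never touches uninitialized or unmapped memory.
    static int8_t* alloc_padded_activations(size_t mr, size_t kc) {
      int8_t* a = malloc(mr * kc + A_PADDING);
      if (a != NULL) {
        memset(a + mr * kc, 0, A_PADDING);
      }
      return a;
    }
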
diff --git a/src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c b/src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
index 08f5386..288eac7 100644
--- a/src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal(
@@ -36,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -456,10 +457,10 @@
       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - (kc - k));
-      a1 = (const int8_t*) ((uintptr_t) a1 - (kc - k));
-      a2 = (const int8_t*) ((uintptr_t) a2 - (kc - k));
-      a3 = (const int8_t*) ((uintptr_t) a3 - (kc - k));
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
 
       nc -= 8;
     } else {
diff --git a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
index dc3c135..3a9277e 100644
--- a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_6x16c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 6);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -167,7 +172,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 6x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -243,17 +248,8 @@
         vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-    a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-    a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-    a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-    a4 = (const int8_t*) ((uintptr_t) a4 - kc);
-    a5 = (const int8_t*) ((uintptr_t) a5 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -408,6 +404,13 @@
       c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
       c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
index 61ee8a9..b1fc09f 100644
--- a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_6x8c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 6);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -127,7 +132,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 6x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -175,17 +180,8 @@
         vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-    a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-    a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-    a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-    a4 = (const int8_t*) ((uintptr_t) a4 - kc);
-    a5 = (const int8_t*) ((uintptr_t) a5 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -280,6 +276,13 @@
       c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
       c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
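
The remainder comments in the c4 kernels are tightened throughout: with kc a
multiple of 4 and the main loop consuming 8 bytes per step, the leftover k
can only be 0 or 4, which is also why the c2 kernels could drop their last
partial-channel branch (tail at most 6) and why the c8 kernels below lose
the "k underflowed" commentary (tail 0 or 8 after a 16-byte loop). A small
throwaway program to confirm the arithmetic:

    #include <stdio.h>
    #include <stddef.h>

    static size_t round_up_po2(size_t n, size_t q) {
      return (n + q - 1) & ~(q - 1);  // q is a power of two
    }

    int main(void) {
      for (size_t kc = 1; kc <= 24; kc++) {
        printf("kc=%2zu  c2 tail=%zu  c4 tail=%zu  c8 tail=%zu\n", kc,
               round_up_po2(kc, 2) % 8,    // <= 6: the removed `k > 6` branch is dead
               round_up_po2(kc, 4) % 8,    // 0 or 4: "up to 4 final positions"
               round_up_po2(kc, 8) % 16);  // 0 or 8: no underflow case left
      }
      return 0;
    }
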
diff --git a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
index 666ef0d..93dab4c 100644
--- a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 8);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -205,7 +210,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 8x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -299,19 +304,8 @@
         vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-    a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-    a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-    a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-    a4 = (const int8_t*) ((uintptr_t) a4 - kc);
-    a5 = (const int8_t*) ((uintptr_t) a5 - kc);
-    a6 = (const int8_t*) ((uintptr_t) a6 - kc);
-    a7 = (const int8_t*) ((uintptr_t) a7 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -510,6 +504,15 @@
       c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);
       c7 = (int8_t*) ((uintptr_t) c7 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+      a6 = (const int8_t*) ((uintptr_t) a6 - kc);
+      a7 = (const int8_t*) ((uintptr_t) a7 - kc);
+
       nc -= 16;
     } else {
       // Final case where not all of the 16 columns fit in the destination.
diff --git a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
index 159363b..6dbeb3a 100644
--- a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
@@ -7,12 +7,12 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-
 #include <assert.h>
 
 #include <arm_neon.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot(
@@ -30,7 +30,12 @@
   assert(mr <= 8);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
@@ -153,7 +158,7 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 7 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 8x4 block of activations.
       const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
@@ -211,19 +216,8 @@
         vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
       }
     }
-    // End of accumulation loop. The variable `kc` contains the amount by which
-    // we advanced the `va` pointers, so we rewind by this amount now.
-    a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-    a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-    a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-    a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-    a4 = (const int8_t*) ((uintptr_t) a4 - kc);
-    a5 = (const int8_t*) ((uintptr_t) a5 - kc);
-    a6 = (const int8_t*) ((uintptr_t) a6 - kc);
-    a7 = (const int8_t*) ((uintptr_t) a7 - kc);
 
     // Post-accumulation work
-
     const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
     const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
 
@@ -342,6 +336,15 @@
       c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);
       c7 = (int8_t*) ((uintptr_t) c7 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+      a4 = (const int8_t*) ((uintptr_t) a4 - kc);
+      a5 = (const int8_t*) ((uintptr_t) a5 - kc);
+      a6 = (const int8_t*) ((uintptr_t) a6 - kc);
+      a7 = (const int8_t*) ((uintptr_t) a7 - kc);
+
       nc -= 8;
     } else {
       // Final case where not all of the 8 columns fit in the destination.
diff --git a/src/qs8-igemm/MRx16c8-avx512skx.c.in b/src/qs8-igemm/MRx16c8-avx512skx.c.in
index 0244207..af17147 100644
--- a/src/qs8-igemm/MRx16c8-avx512skx.c.in
+++ b/src/qs8-igemm/MRx16c8-avx512skx.c.in
@@ -12,6 +12,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 $GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
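
From here the same kc rounding lands in the IGEMM templates and their
generated files, with no rewind edits at all: an IGEMM kernel re-reads its
activation row pointers from an indirection buffer on every outer step
rather than advancing persistent a0..aM pointers. A minimal sketch of that
access pattern, with assumed semantics and invented names:

    #include <stdint.h>
    #include <stddef.h>

    // Walk an indirection buffer of row pointers, reading kc bytes from
    // each row. No activation pointer survives across iterations, so
    // nothing needs rewinding when kc is rounded up.
    static int32_t sum_rows(const int8_t* const* indirection, size_t rows, size_t kc) {
      int32_t acc = 0;
      for (size_t i = 0; i < rows; i++) {
        const int8_t* a0 = indirection[i];  // fresh pointer each step
        for (size_t k = 0; k < kc; k++) {
          acc += a0[k];
        }
      }
      return acc;
    }
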
diff --git a/src/qs8-igemm/MRx4c2-sse.c.in b/src/qs8-igemm/MRx4c2-sse.c.in
index b917f66..d111fa1 100644
--- a/src/qs8-igemm/MRx4c2-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-sse.c.in
@@ -18,6 +18,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 $ISA = {2: "sse2", 3: "ssse3", 4: "sse41", 5: "xop"}[SSE]
@@ -46,6 +47,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
@@ -180,20 +182,6 @@
               $else:
                 vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
                   _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              $for M in range(MR):
-                $if SSE == 5:
-                  vacc${M}x0123 = _mm_maddd_epi16(
-                    _mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc${M}x0123);
-                $else:
-                  vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
-                    _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/MRx4c8-sse.c.in b/src/qs8-igemm/MRx4c8-sse.c.in
index 6a9b5df..030d144 100644
--- a/src/qs8-igemm/MRx4c8-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-sse.c.in
@@ -18,6 +18,7 @@
   #include <${SSE_HEADER}>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 $ISA = {2: "sse2", 3: "ssse3", 4: "sse41", 5: "xop"}[SSE]
@@ -46,6 +47,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/MRx4c8-wasmsimd.c.in b/src/qs8-igemm/MRx4c8-wasmsimd.c.in
index 51dc5cb..77282fc 100644
--- a/src/qs8-igemm/MRx4c8-wasmsimd.c.in
+++ b/src/qs8-igemm/MRx4c8-wasmsimd.c.in
@@ -10,6 +10,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 $LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/MRx8c8-avx2.c.in b/src/qs8-igemm/MRx8c8-avx2.c.in
index f30a5e8..ed18162 100644
--- a/src/qs8-igemm/MRx8c8-avx2.c.in
+++ b/src/qs8-igemm/MRx8c8-avx2.c.in
@@ -10,6 +10,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_${MR}x8c8__avx2(
@@ -37,6 +38,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/MRxNRc4-neondot.c.in b/src/qs8-igemm/MRxNRc4-neondot.c.in
index 00b0382..e49b634 100644
--- a/src/qs8-igemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-igemm/MRxNRc4-neondot.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c4__neondot(
@@ -39,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
@@ -92,7 +93,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a ${MR}x4 block of activations.
         $for M in range(MR):
diff --git a/src/qs8-igemm/c16-neon-mlal-padal.c.in b/src/qs8-igemm/c16-neon-mlal-padal.c.in
index f2202a4..1d95ae3 100644
--- a/src/qs8-igemm/c16-neon-mlal-padal.c.in
+++ b/src/qs8-igemm/c16-neon-mlal-padal.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c16__neon_mlal_padal(
@@ -39,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
diff --git a/src/qs8-igemm/c2-neon-mull-padal-dup.c.in b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
index 08dc2a0..79e350b 100644
--- a/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+++ b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
@@ -39,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
@@ -143,16 +144,6 @@
               $for N in range(0, NR, 4):
                 const int16x8_t vprod${M}x${ABC[N:N+4]}c2 = vmull_s8(vb${ABC[N:N+4]}c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 2)));
                 vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              $for N in range(0, NR, 4):
-                const int8x8_t vb${ABC[N:N+4]}c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              $for M in range(MR):
-                $for N in range(0, NR, 4):
-                  const int16x8_t vprod${M}x${ABC[N:N+4]}c3 = vmull_s8(vb${ABC[N:N+4]}c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 3)));
-                  vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/c8-neon-mull-padal.c.in b/src/qs8-igemm/c8-neon-mull-padal.c.in
index 26c8336..66b17b1 100644
--- a/src/qs8-igemm/c8-neon-mull-padal.c.in
+++ b/src/qs8-igemm/c8-neon-mull-padal.c.in
@@ -10,8 +10,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c8__neon_mull_padal(
@@ -39,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   $for M in range(1, MR):
     int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
@@ -93,9 +94,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         $for M in range(MR):
           const int8x8_t va${M} = vld1_s8(a${M});
diff --git a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c b/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
index d6c5eca..7a74253 100644
--- a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_12x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -240,7 +241,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 12x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
index 293022a..302733a 100644
--- a/src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
index 21f0a45..84cc0e7 100644
--- a/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -249,22 +250,6 @@
             vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
             const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
             vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
index f384462..9ae6fc7 100644
--- a/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -161,22 +162,6 @@
             vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
             const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
             vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
index fb5c186..a72ae5b 100644
--- a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
 
   do {
@@ -85,7 +86,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 1x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
index 924734f..34e6a19 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   const __mmask16 vbias_mask = _cvtu32_mask16(0x1111);
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
index ca428ce..811cb56 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
@@ -159,9 +160,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
 
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
index 42dc29e..cdff4d5 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
index d78c425..bd6ce42 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
index 0d5b1dc..312afa9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
index 2827d03..af2b116 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
index b9aba97..5f308dd 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
index 862a3f6..ff8241a 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -110,15 +112,6 @@
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
index 5882450..fe4d345 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -115,15 +117,6 @@
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
index 5aafdf0..7417f8e 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c2__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -115,15 +117,6 @@
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
index fe9eb8c..6837212 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
index 2dd1d31..fc91529 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
index 10b9d02..66f8bbb 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
index c171ead..4031b08 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
index 561d72f..b04f5bf 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
index f95561d..6036ab6 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c
index 8a104c0..9af046a 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   const v128_t vzero = wasm_f64x2_splat(0.0);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
index 15bc8bb..faadba9 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   const v128_t vzero = wasm_f64x2_splat(0.0);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
index 0686d4e..aa7eca9 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
index fb46c83..1204750 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
index de8a413..e81fbe7 100644
--- a/src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
 
   do {
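
For the c16 kernels the bump goes all the way to the channel tile (the include swap from <xnnpack/common.h> to <xnnpack/math.h> brings in round_up_po2), so, assuming the usual `while (k >= 16 * sizeof(int8_t))` main-loop shape of these kernels, the trip count is exact and no sub-tile tail is required. A quick check of that claim, illustrative only, with round_up_po2 as sketched above:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    // round_up_po2 as sketched above (assumed <xnnpack/math.h> semantics).
    static inline size_t round_up_po2(size_t n, size_t q) {
      return (n + q - 1) & ~(q - 1);
    }

    int main(void) {
      for (size_t kc = 1; kc <= 33; kc += 8) {
        size_t k = round_up_po2(kc, 16);  // c16 channel tile
        size_t steps = 0;
        while (k >= 16 * sizeof(int8_t)) {  // assumed main-loop shape
          k -= 16 * sizeof(int8_t);
          steps++;
        }
        // k is always 0 on exit, so no partial-channel tail is needed.
        printf("kc=%2zu: %zu steps, tail k=%zu\n", kc, steps, k);
      }
      return 0;
    }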
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
index 0f4c1cf..ccd894e 100644
--- a/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -165,16 +166,6 @@
             vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
             const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
             vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
index d8d8367..c2f45d1 100644
--- a/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
 
   do {
@@ -117,16 +118,6 @@
             vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
             const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
             vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
index 07c1ad1..ed42605 100644
--- a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
 
   do {
@@ -75,7 +76,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 6 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 1x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
index 4eaa400..4c41935 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
index 86e4775..d4c86c9 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   do {
@@ -111,9 +112,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
 
diff --git a/src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
index 293e5e5..db2455b 100644
--- a/src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
index 8fa5b3c..d35ce3f 100644
--- a/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
@@ -369,30 +370,6 @@
             vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
             const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
             vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-              const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
index 867c6c0..b13ce66 100644
--- a/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
@@ -231,30 +232,6 @@
             vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
             const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
             vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-              const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
index 347ed78..328bc55 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
index 2595047..c6cd3ef 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
@@ -233,9 +234,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
         const int8x8_t va1 = vld1_s8(a1);
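
The comment block deleted here and in the other c8 mull-padal kernels described three kc ranges because k could end anywhere in 0..8 or underflow. With kc bumped to a multiple of 8, k after the 16-wide main loop is exactly 0 or 8, so the 8-wide tail either runs once in full or not at all. A sketch of the collapsed case analysis (illustrative harness, not kernel code):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    // round_up_po2 as sketched above (assumed <xnnpack/math.h> semantics).
    static inline size_t round_up_po2(size_t n, size_t q) {
      return (n + q - 1) & ~(q - 1);
    }

    int main(void) {
      for (size_t kc = 1; kc <= 24; kc++) {
        size_t k = round_up_po2(kc, 8);
        while (k >= 16 * sizeof(int8_t)) {
          k -= 16 * sizeof(int8_t);  // 16-wide MULL/PADAL main loop
        }
        // k is now exactly 0 or 8: the old comment's 1-to-7-byte
        // remainder and underflow cases can no longer occur.
        printf("kc=%2zu -> tail k=%zu\n", kc, k);
      }
      return 0;
    }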
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
index 906861e..c771d14 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
index 3a728bd..e49603b 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
index 6d68d36..e5e18fd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
index c7b7392..09d8a27 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
index 2b1d86a..d10dc86 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
index 15e19f9..2b97a3c 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c
index 11188d4..3aa0434 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c
index a85d4d7..d2675e6 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
index 00d6188..e58f425 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
index 70458bd..02bbdf4 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
index c95128a..bc9ba0e 100644
--- a/src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
index 478e2c5..f48331d 100644
--- a/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
@@ -231,20 +232,6 @@
             vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
             const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
             vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
index 03fa52d..5178132 100644
--- a/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
@@ -157,20 +158,6 @@
             vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
             const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
             vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
index d3060c3..9291845 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
index 9da18a4..875fe8e 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr != 2) {
@@ -153,9 +154,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
         const int8x8_t va1 = vld1_s8(a1);
diff --git a/src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
index 70f26b2..2b9bfae 100644
--- a/src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
index 6d304e2..5fa8f78 100644
--- a/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -489,38 +490,6 @@
             vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
             const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
             vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-              const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-              const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-              const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
index 53161f3..c971f66 100644
--- a/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -301,38 +302,6 @@
             vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
             const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
             vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-              const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-              const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-              const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
index 3fe63e4..f9708b7 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
index 97fd9d7..9a305da 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -307,9 +308,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
         const int8x8_t va1 = vld1_s8(a1);
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
index dd940c0..47dba04 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
index 4904a1c..eaffe04 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
index 14d7f8c..b4c8d56 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
index 5450387..6092d91 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
index 3f3aefa..f428f76 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
index 8cca813..141be70 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c
index 0c95932..f232658 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c
index 7f22e3d..1b9efcc 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
index 1760fc0..4adc8c1 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
index c215aa9..6bb8aaa 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
index 17ddc35..626dfa4 100644
--- a/src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
index fac3944..bb5b54f 100644
--- a/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -297,24 +298,6 @@
             vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
             const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
             vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
index 3f7f6b3..0177c18 100644
--- a/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,24 +198,6 @@
             vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
             const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
             vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
index 4269041..231c884 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2(
@@ -40,6 +41,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
index 0e70a8b..5c7810d 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -195,9 +196,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
         const int8x8_t va1 = vld1_s8(a1);
diff --git a/src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
index 8368cd2..df63cae 100644
--- a/src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x16c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
index f6d24d2..ef26e6a 100644
--- a/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -609,46 +610,6 @@
             vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
             const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
             vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-              const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-              const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-              const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-              const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-              const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-              const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
-              const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
index d7a469b..6b8b47e 100644
--- a/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -371,46 +372,6 @@
             vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
             const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
             vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
-              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
-              const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-              const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
-              const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
-              const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-              const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-              const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
-              const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
index a315841..95dde6d 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
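+  // kc is now a multiple of 4, so after the 8-wide main loop the tail k is either 0 or 4.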
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -148,7 +149,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 4x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c b/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
index b16ed2c..4f724c4 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-avx512skx.c
@@ -13,6 +13,7 @@
 
 #include <xnnpack/igemm.h>
 #include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx(
@@ -38,6 +39,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
index 9430f21..b03befc 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
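+  // kc is now a multiple of 8: the 16-wide main loop leaves a tail of exactly 0 or 8, and k can no longer underflow.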
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -381,9 +382,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
         const int8x8_t va1 = vld1_s8(a1);
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
index 89c7807..012a594 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
index 7b5ce73..3d83c87 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -12,6 +12,7 @@
 #include <emmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse2_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
index 6fc97cc..d0a7c2a 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
index 4978b7b..9c34aa3 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -12,6 +12,7 @@
 #include <smmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
index 82cfd1f..9c1304f 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld128(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
index 8a2def1..2d56cdf 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -12,6 +12,7 @@
 #include <tmmintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__ssse3_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -197,21 +199,6 @@
               _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123,
-                _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
index b1b6880..e6faba2 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -202,21 +204,6 @@
               _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
             vacc3x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-              vacc1x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-              vacc2x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-              vacc3x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
index 7f3c778..0253dfa 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c
@@ -17,6 +17,7 @@
 #endif
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64(
@@ -44,6 +45,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -202,21 +204,6 @@
               _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
             vacc3x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              w = (const void*) ((uintptr_t) w + 8);
-              const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
-              vacc0x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
-              vacc1x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
-              vacc2x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
-              vacc3x0123 = _mm_maddd_epi16(
-                _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c b/src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
index 9486381..ef871fb 100644
--- a/src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x8c16-minmax-neon-mlal-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 16);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
index d135354..80aa9ac 100644
--- a/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+++ b/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -363,28 +364,6 @@
             vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
             const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
             vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-              const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-              const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
index 8afdcf0..5f635fa 100644
--- a/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+++ b/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -237,28 +238,6 @@
             vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
             const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
             vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
-
-            if (k > 6 * sizeof(int8_t)) {
-              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
-
-              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
-              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
-              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
-              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
-              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
-              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
-              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
-              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
-              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
-              const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
-              const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
-              vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
-            }
           }
         }
       }
diff --git a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
index b7f6871..5758e21 100644
--- a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -120,7 +121,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 4x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c b/src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
index 594adc9..ae06019 100644
--- a/src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -237,9 +238,6 @@
         k -= 16 * sizeof(int8_t);
       }
       // Handle up to 8 final positions of `k`
-      // If kc was 0 or 16, there is no remainder.  k is 0.
-      // If kc was 1 to 8,  there is a remainder of k.
-      // If kc was 9 to 15, the main loop handled the remainder; k underflowed.
       if XNN_UNLIKELY(k > 0) {
         const int8x8_t va0 = vld1_s8(a0);
         const int8x8_t va1 = vld1_s8(a1);
diff --git a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
index b013d4b..ab885b5 100644
--- a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_6x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -190,7 +191,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 6x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
index 13bb060..b4c3bf2 100644
--- a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -150,7 +151,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 6x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
index 40f8911..001ce33 100644
--- a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -232,7 +233,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 8x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
index d8f4223..9ce9937 100644
--- a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include <xnnpack/common.h>
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot(
@@ -40,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 4);
   int8_t* c0 = c;
   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -180,7 +181,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 7 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 8x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
diff --git a/src/qu8-gemm/2x4c8-minmax-sse2.c b/src/qu8-gemm/2x4c8-minmax-sse2.c
index 371fe74..e82204d 100644
--- a/src/qu8-gemm/2x4c8-minmax-sse2.c
+++ b/src/qu8-gemm/2x4c8-minmax-sse2.c
@@ -48,7 +48,12 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(uint8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
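+  // The padded kc now doubles as the rewind stride for the A pointers, replacing the old kc_stride.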
   const uint8_t* a0 = a;
   uint8_t* c0 = c;
   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
@@ -58,7 +63,6 @@
     c1 = c0;
   }
 
-  const size_t kc_stride = round_up_po2(kc, 8);
   const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->sse2.kernel_zero_point);
 
   do {
@@ -173,8 +177,8 @@
       *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
       *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_epi64(vout, 32));
 
-      a0 = (const uint8_t*) ((uintptr_t) a0 - kc_stride);
-      a1 = (const uint8_t*) ((uintptr_t) a1 - kc_stride);
+      a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
 
       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
diff --git a/src/qu8-gemm/4x4c2-minmax-sse2.c b/src/qu8-gemm/4x4c2-minmax-sse2.c
index 45afdd2..6f687ce 100644
--- a/src/qu8-gemm/4x4c2-minmax-sse2.c
+++ b/src/qu8-gemm/4x4c2-minmax-sse2.c
@@ -11,6 +11,7 @@
 #include <immintrin.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2(
@@ -29,7 +30,12 @@
   assert(mr <= 4);
   assert(nc != 0);
   assert(kc != 0);
+  assert(kc % sizeof(uint8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   const uint8_t* a0 = a;
   uint8_t* c0 = c;
   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
@@ -181,21 +187,6 @@
             _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
           vacc3x0123 = _mm_add_epi32(vacc3x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-          if (k > 6 * sizeof(uint8_t)) {
-            const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
-            const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
-
-            vacc0x0123 = _mm_add_epi32(vacc0x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc1x0123 = _mm_add_epi32(vacc1x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc2x0123 = _mm_add_epi32(vacc2x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            vacc3x0123 = _mm_add_epi32(vacc3x0123,
-              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-          }
         }
       }
     }
diff --git a/src/qu8-igemm/4x4c2-minmax-sse2.c b/src/qu8-igemm/4x4c2-minmax-sse2.c
index ba7573c..cef6f81 100644
--- a/src/qu8-igemm/4x4c2-minmax-sse2.c
+++ b/src/qu8-igemm/4x4c2-minmax-sse2.c
@@ -11,6 +11,7 @@
 #include <immintrin.h>
 
 #include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2(
@@ -33,7 +34,12 @@
   assert(kc != 0);
   assert(ks != 0);
   assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(uint8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
 
+  kc = round_up_po2(kc, 2);
   uint8_t* c0 = c;
   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
   if XNN_UNPREDICTABLE(mr < 2) {
@@ -163,17 +169,6 @@
             vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
             vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
-            if (k > 6 * sizeof(uint8_t)) {
-              const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-              const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
-              w = (void*) ((uintptr_t) w + 8);
-
-              vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-              vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-            }
           }
         }
       }