Reoptimize SSE QS8 GAVGPOOL microkernels

PiperOrigin-RevId: 421194667
diff --git a/src/amalgam/sse2.c b/src/amalgam/sse2.c
index 52973ee..a32a5ec 100644
--- a/src/amalgam/sse2.c
+++ b/src/amalgam/sse2.c
@@ -5812,13 +5812,13 @@
     const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
     i6 += 8;
 
-    const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-    const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-    const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-    const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-    const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-    const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-    const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
+    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
 
     __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
     __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
@@ -5874,13 +5874,13 @@
       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
       i6 += 8;
 
-      const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-      const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-      const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-      const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-      const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-      const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-      const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
+      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
 
       __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
       __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
@@ -5919,13 +5919,14 @@
         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
         output += 4;
       }
+      uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
       if (channels & 2) {
-        *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
-        vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+        *((uint16_t*) output) = (uint16_t) vout0123;
+        vout0123 >>= 16;
         output += 2;
       }
       if (channels & 1) {
-        *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+        *output = (int8_t) vout0123;
       }
     }
   }
@@ -6096,13 +6097,14 @@
         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
         output += 4;
       }
+      uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
       if (channels & 2) {
-        *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout0123456701234567);
-        vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+        *((uint16_t*) output) = (uint16_t) vout0123;
+        vout0123 >>= 16;
         output += 2;
       }
       if (channels & 1) {
-        *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+        *output = (int8_t) vout0123;
       }
     }
   }
diff --git a/src/amalgam/sse41.c b/src/amalgam/sse41.c
index 2046af6..33d21a0 100644
--- a/src/amalgam/sse41.c
+++ b/src/amalgam/sse41.c
@@ -4041,7 +4041,7 @@
     vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
     __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
-    __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+    __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
     buffer += 8;
 
     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -4095,7 +4095,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) buffer));
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
       buffer += 8;
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -4257,7 +4257,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), vinit_bias);
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), vinit_bias);
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), vinit_bias);
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16-acc2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16-acc2.c
index 84ed16c..fb76f1f 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16-acc2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16-acc2.c
@@ -241,20 +241,20 @@
     const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
     i6 += 16;
 
-    const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-    const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x89ABCDEF));
-    const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-    const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x89ABCDEF));
-    const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-    const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x89ABCDEF));
-    const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-    const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x89ABCDEF));
-    const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-    const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x89ABCDEF));
-    const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-    const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x89ABCDEF));
-    const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
-    const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x89ABCDEF));
+    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
+    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
+    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
+    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
+    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
+    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
+    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
+    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
 
     __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
     __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
@@ -329,13 +329,13 @@
       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
       i6 += 8;
 
-      const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-      const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-      const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-      const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-      const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-      const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-      const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
+      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
 
       __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
       __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
@@ -379,13 +379,14 @@
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
           output += 4;
         }
+        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
         if (channels & 2) {
-          *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
-          vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+          *((uint16_t*) output) = (uint16_t) vout0123;
+          vout0123 >>= 16;
           output += 2;
         }
         if (channels & 1) {
-          *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+          *output = (int8_t) vout0123;
           output += 1;
         }
         channels = 0;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24-acc2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24-acc2.c
index afe5992..23b7770 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24-acc2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24-acc2.c
@@ -390,27 +390,27 @@
     const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
     i6 += 24;
 
-    const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-    const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x89ABCDEF));
-    const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0xGHIJKLMN));
-    const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-    const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x89ABCDEF));
-    const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1xGHIJKLMN));
-    const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-    const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x89ABCDEF));
-    const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2xGHIJKLMN));
-    const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-    const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x89ABCDEF));
-    const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3xGHIJKLMN));
-    const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-    const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x89ABCDEF));
-    const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4xGHIJKLMN));
-    const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-    const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x89ABCDEF));
-    const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5xGHIJKLMN));
-    const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
-    const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x89ABCDEF));
-    const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6xGHIJKLMN));
+    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
+    const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
+    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
+    const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
+    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
+    const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
+    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
+    const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
+    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
+    const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
+    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
+    const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
+    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
+    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
+    const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);
 
     __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
     __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
@@ -506,13 +506,13 @@
       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
       i6 += 8;
 
-      const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-      const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-      const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-      const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-      const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-      const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-      const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
+      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
 
       __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
       __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
@@ -556,13 +556,14 @@
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
           output += 4;
         }
+        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
         if (channels & 2) {
-          *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
-          vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+          *((uint16_t*) output) = (uint16_t) vout0123;
+          vout0123 >>= 16;
           output += 2;
         }
         if (channels & 1) {
-          *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+          *output = (int8_t) vout0123;
           output += 1;
         }
         channels = 0;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8-acc2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8-acc2.c
index 348d05c..a1fa144 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8-acc2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8-acc2.c
@@ -184,13 +184,13 @@
     const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
     i6 += 8;
 
-    const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-    const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-    const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-    const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-    const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-    const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-    const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
+    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
 
     __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
     __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
@@ -246,13 +246,13 @@
       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
       i6 += 8;
 
-      const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi0x01234567));
-      const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi1x01234567));
-      const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi2x01234567));
-      const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi3x01234567));
-      const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi4x01234567));
-      const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi5x01234567));
-      const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vi6x01234567));
+      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
+      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
+      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
+      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
+      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
+      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
+      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
 
       __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
       __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
@@ -291,13 +291,14 @@
         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
         output += 4;
       }
+      uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
       if (channels & 2) {
-        *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
-        vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+        *((uint16_t*) output) = (uint16_t) vout0123;
+        vout0123 >>= 16;
         output += 2;
       }
       if (channels & 1) {
-        *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+        *output = (int8_t) vout0123;
       }
     }
   }
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16-acc2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16-acc2.c
index 3cc0c76..c42eb38 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16-acc2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16-acc2.c
@@ -227,9 +227,9 @@
     vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF);
 
     __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
-    __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+    __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
     __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
-    __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF)), _mm_load_si128((const __m128i*) (buffer + 12)));
+    __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, vacc0x89ABCDEF), 16), _mm_load_si128((const __m128i*) (buffer + 12)));
     buffer += 16;
 
     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -292,7 +292,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) buffer));
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
       buffer += 8;
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24-acc2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24-acc2.c
index 7486295..4682426 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24-acc2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24-acc2.c
@@ -350,11 +350,11 @@
     vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vacc1xGHIJKLMN);
 
     __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
-    __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+    __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
     __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
-    __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF)), _mm_load_si128((const __m128i*) (buffer + 12)));
+    __m128i vaccCDEF = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, vacc0x89ABCDEF), 16), _mm_load_si128((const __m128i*) (buffer + 12)));
     __m128i vaccGHIJ = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0xGHIJKLMN), _mm_load_si128((const __m128i*) (buffer + 16)));
-    __m128i vaccKLMN = _mm_add_epi32(_mm_unpackhi_epi16(vacc0xGHIJKLMN, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0xGHIJKLMN)), _mm_load_si128((const __m128i*) (buffer + 20)));
+    __m128i vaccKLMN = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0xGHIJKLMN, vacc0xGHIJKLMN), 16), _mm_load_si128((const __m128i*) (buffer + 20)));
     buffer += 24;
 
     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -429,7 +429,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) buffer));
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
       buffer += 8;
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8-acc2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8-acc2.c
index fa4aa2b..1ff77ec 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8-acc2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8-acc2.c
@@ -180,7 +180,7 @@
     vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
     __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
-    __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+    __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
     buffer += 8;
 
     __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
@@ -234,7 +234,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) buffer));
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
       buffer += 8;
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16-acc2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16-acc2.c
index a0c64c2..7ae442f 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16-acc2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c16-acc2.c
@@ -217,13 +217,14 @@
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
           output += 4;
         }
+        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
         if (channels & 2) {
-          *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout0123456701234567);
-          vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+          *((uint16_t*) output) = (uint16_t) vout0123;
+          vout0123 >>= 16;
           output += 2;
         }
         if (channels & 1) {
-          *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+          *output = (int8_t) vout0123;
           output += 1;
         }
         channels = 0;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24-acc2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24-acc2.c
index 0d89a9c..99b375e 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24-acc2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c24-acc2.c
@@ -252,13 +252,14 @@
           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
           output += 4;
         }
+        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
         if (channels & 2) {
-          *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout0123456701234567);
-          vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+          *((uint16_t*) output) = (uint16_t) vout0123;
+          vout0123 >>= 16;
           output += 2;
         }
         if (channels & 1) {
-          *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+          *output = (int8_t) vout0123;
           output += 1;
         }
         channels = 0;
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8-acc2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8-acc2.c
index 5e5b97e..21a9400 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8-acc2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8-acc2.c
@@ -179,13 +179,14 @@
         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
         output += 4;
       }
+      uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
       if (channels & 2) {
-        *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout0123456701234567);
-        vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+        *((uint16_t*) output) = (uint16_t) vout0123;
+        vout0123 >>= 16;
         output += 2;
       }
       if (channels & 1) {
-        *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+        *output = (int8_t) vout0123;
       }
     }
   }
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16-acc2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16-acc2.c
index d814769..0785a87 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16-acc2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c16-acc2.c
@@ -163,7 +163,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), vinit_bias);
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), vinit_bias);
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), vinit_bias);
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24-acc2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24-acc2.c
index e6335e7..04a1103 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24-acc2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c24-acc2.c
@@ -190,7 +190,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), vinit_bias);
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), vinit_bias);
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), vinit_bias);
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8-acc2.c b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8-acc2.c
index ec713af..e4cd199 100644
--- a/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8-acc2.c
+++ b/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8-acc2.c
@@ -139,7 +139,7 @@
       vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
 
       __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), vinit_bias);
-      __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), vinit_bias);
+      __m128i vacc4567 = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x01234567, vacc0x01234567), 16), vinit_bias);
 
       __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
       __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
diff --git a/src/qs8-gavgpool/multipass-sse.c.in b/src/qs8-gavgpool/multipass-sse.c.in
index f3d7071..411b9a5 100644
--- a/src/qs8-gavgpool/multipass-sse.c.in
+++ b/src/qs8-gavgpool/multipass-sse.c.in
@@ -274,7 +274,7 @@
     $if SSE < 4:
       $for M in range(ROW_SUBTILE):
         $for C in range(0, CHANNEL_TILE, 8):
-          const __m128i vxi${M}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vi${M}x${ABC[C:C+8]}));
+          const __m128i vxi${M}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M}x${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}), 8);
 
     $for A in range(ACCUMULATORS):
       $for C in range(0, CHANNEL_TILE, 8):
@@ -297,7 +297,7 @@
     $for C in range(0, CHANNEL_TILE, 8):
       $if SSE >= 4:
         __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (buffer + ${C})));
-        __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x${ABC[C:C+8]}, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x${ABC[C:C+8]})), _mm_load_si128((const __m128i*) (buffer + ${C+4})));
+        __m128i vacc${ABC[C+4:C+8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x${ABC[C:C+8]}, vacc0x${ABC[C:C+8]}), 16), _mm_load_si128((const __m128i*) (buffer + ${C+4})));
       $else:
         const __m128i vsgnacc0x${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x${ABC[C:C+8]});
         __m128i vacc${ABC[C:C+4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x${ABC[C:C+8]}, vsgnacc0x${ABC[C:C+8]}), _mm_load_si128((const __m128i*) (buffer + ${C})));
@@ -358,7 +358,7 @@
 
       $if SSE < 4:
         $for M in range(ROW_SUBTILE):
-          const __m128i vxi${M}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M}x${ABC[0:8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vi${M}x${ABC[0:8]}));
+          const __m128i vxi${M}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M}x${ABC[0:8]}, vi${M}x${ABC[0:8]}), 8);
 
       $for A in range(ACCUMULATORS):
         __m128i vacc${A}x${ABC[0:8]} = _mm_add_epi16(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});
@@ -377,7 +377,7 @@
 
       $if SSE >= 4:
         __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x${ABC[0:8]}), _mm_load_si128((const __m128i*) buffer));
-        __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x${ABC[0:8]}, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x${ABC[0:8]})), _mm_load_si128((const __m128i*) (buffer + 4)));
+        __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x${ABC[0:8]}, vacc0x${ABC[0:8]}), 16), _mm_load_si128((const __m128i*) (buffer + 4)));
       $else:
         const __m128i vsgnacc0x${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x${ABC[0:8]});
         __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x${ABC[0:8]}, vsgnacc0x${ABC[0:8]}), _mm_load_si128((const __m128i*) buffer));
@@ -410,6 +410,41 @@
           output += 8;
           channels -= 8;
         } else {
+          $if SSE == 4:
+            if (channels & 4) {
+              *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+              output += 4;
+            }
+            if (channels & 2) {
+              *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
+              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
+              output += 2;
+            }
+            if (channels & 1) {
+              *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+              output += 1;
+            }
+          $else:
+            if (channels & 4) {
+              *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+              output += 4;
+            }
+            uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+            if (channels & 2) {
+              *((uint16_t*) output) = (uint16_t) vout${ABC[0:4]};
+              vout${ABC[0:4]} >>= 16;
+              output += 2;
+            }
+            if (channels & 1) {
+              *output = (int8_t) vout${ABC[0:4]};
+              output += 1;
+            }
+          channels = 0;
+        }
+      $else:
+        $if SSE == 4:
           if (channels & 4) {
             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
             vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
@@ -421,31 +456,23 @@
             output += 2;
           }
           if (channels & 1) {
-            $if SSE >= 4:
-              *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
-            $else:
-              *output = (int32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
-            output += 1;
-          }
-          channels = 0;
-        }
-      $else:
-        if (channels & 4) {
-          *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
-          vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
-          output += 4;
-        }
-        if (channels & 2) {
-          *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
-          vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
-          output += 2;
-        }
-        if (channels & 1) {
-          $if SSE >= 4:
             *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
-          $else:
-            *output = (int32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
-        }
+          }
+        $else:
+          if (channels & 4) {
+            *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+            vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+            output += 4;
+          }
+          uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+          if (channels & 2) {
+            *((uint16_t*) output) = (uint16_t) vout${ABC[0:4]};
+            vout${ABC[0:4]} >>= 16;
+            output += 2;
+          }
+          if (channels & 1) {
+            *output = (int8_t) vout${ABC[0:4]};
+          }
     }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
   }
 }
diff --git a/src/qs8-gavgpool/unipass-sse.c.in b/src/qs8-gavgpool/unipass-sse.c.in
index 73324aa..7656b88 100644
--- a/src/qs8-gavgpool/unipass-sse.c.in
+++ b/src/qs8-gavgpool/unipass-sse.c.in
@@ -167,7 +167,7 @@
 
       $if SSE == 4:
         __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x${ABC[0:8]}), vinit_bias);
-        __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x${ABC[0:8]}, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x${ABC[0:8]})), vinit_bias);
+        __m128i vacc${ABC[4:8]} = _mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(vacc0x${ABC[0:8]}, vacc0x${ABC[0:8]}), 16), vinit_bias);
       $else:
         const __m128i vsgnacc0x${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x${ABC[0:8]});
         __m128i vacc${ABC[0:4]} = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x${ABC[0:8]}, vsgnacc0x${ABC[0:8]}), vinit_bias);
@@ -199,48 +199,69 @@
           output += 8;
           channels -= 8;
         } else {
+          $if SSE == 4:
+            if (channels & 4) {
+              *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+              output += 4;
+            }
+            if (channels & 2) {
+              *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
+              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
+              output += 2;
+            }
+            if (channels & 1) {
+              *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+              output += 1;
+            }
+          $else:
+            if (channels & 4) {
+              *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+              vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+              output += 4;
+            }
+            uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+            if (channels & 2) {
+              *((uint16_t*) output) = (uint16_t) vout${ABC[0:4]};
+              vout${ABC[0:4]} >>= 16;
+              output += 2;
+            }
+            if (channels & 1) {
+              *output = (int8_t) vout${ABC[0:4]};
+              output += 1;
+            }
+          channels = 0;
+        }
+      $else:
+        $if SSE == 4:
           if (channels & 4) {
             *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
             vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
             output += 4;
           }
           if (channels & 2) {
-            $if SSE == 4:
-              *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
-            $else:
-              *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
             vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
             output += 2;
           }
           if (channels & 1) {
-            $if SSE == 4:
-              *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
-            $else:
-              *output = (int32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
-            output += 1;
-          }
-          channels = 0;
-        }
-      $else:
-        if (channels & 4) {
-          *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
-          vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
-          output += 4;
-        }
-        if (channels & 2) {
-          $if SSE == 4:
-            *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
-          $else:
-            *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
-          vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
-          output += 2;
-        }
-        if (channels & 1) {
-          $if SSE == 4:
             *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
-          $else:
-            *output = (int32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
-        }
+          }
+        $else:
+          if (channels & 4) {
+            *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+            vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+            output += 4;
+          }
+          uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+          if (channels & 2) {
+            *((uint16_t*) output) = (uint16_t) vout${ABC[0:4]};
+            vout${ABC[0:4]} >>= 16;
+            output += 2;
+          }
+          if (channels & 1) {
+            *output = (int8_t) vout${ABC[0:4]};
+          }
     }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
   }
 }