blob: c00d376d484f1043f10521faee848638afc072cd [file] [log] [blame]
// Auto-generated file. Do not edit!
// Template: src/qu8-gemm/MRxNRc4-minmax-scalar.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
#include <xnnpack/gemm.h>
#include <xnnpack/scalar-utils.h>
// This kernel is a scalar model for a kernel using ARMv8.2 dot-product
// instructions.
//
// XNN_DISABLE_TSAN is used because this kernel reads up to 3 bytes past the
// bounds of the `a` matrix region, which may be a race condition with
// another thread. We deem this acceptable because the values that are
// read out of bounds do not affect the result, and the compiler can't know
// about this undefined behavior.
void xnn_qu8_gemm_minmax_ukernel_12x4c4__scalar(
size_t mr,
size_t nc,
size_t kc,
const uint8_t* restrict a,
size_t a_stride,
const void* restrict w,
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_qu8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN {
assert(mr != 0);
assert(mr <= 12);
assert(nc != 0);
assert(kc != 0);
const uint8_t* a0 = a;
uint8_t* c0 = c;
const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
a1 = a0;
c1 = c0;
}
const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 2) {
a2 = a1;
c2 = c1;
}
const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr < 4) {
a3 = a2;
c3 = c2;
}
const uint8_t* a4 = (const uint8_t*) ((uintptr_t) a3 + a_stride);
uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 4) {
a4 = a3;
c4 = c3;
}
const uint8_t* a5 = (const uint8_t*) ((uintptr_t) a4 + a_stride);
uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride);
if XNN_UNPREDICTABLE(mr < 6) {
a5 = a4;
c5 = c4;
}
const uint8_t* a6 = (const uint8_t*) ((uintptr_t) a5 + a_stride);
uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 6) {
a6 = a5;
c6 = c5;
}
const uint8_t* a7 = (const uint8_t*) ((uintptr_t) a6 + a_stride);
uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride);
if XNN_UNPREDICTABLE(mr < 8) {
a7 = a6;
c7 = c6;
}
const uint8_t* a8 = (const uint8_t*) ((uintptr_t) a7 + a_stride);
uint8_t* c8 = (uint8_t*) ((uintptr_t) c7 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 8) {
a8 = a7;
c8 = c7;
}
const uint8_t* a9 = (const uint8_t*) ((uintptr_t) a8 + a_stride);
uint8_t* c9 = (uint8_t*) ((uintptr_t) c8 + cm_stride);
if XNN_UNPREDICTABLE(mr < 10) {
a9 = a8;
c9 = c8;
}
const uint8_t* a10 = (const uint8_t*) ((uintptr_t) a9 + a_stride);
uint8_t* c10 = (uint8_t*) ((uintptr_t) c9 + cm_stride);
if XNN_UNPREDICTABLE(mr <= 10) {
a10 = a9;
c10 = c9;
}
const uint8_t* a11 = (const uint8_t*) ((uintptr_t) a10 + a_stride);
uint8_t* c11 = (uint8_t*) ((uintptr_t) c10 + cm_stride);
if XNN_UNPREDICTABLE(mr != 12) {
a11 = a10;
c11 = c10;
}
const int32_t vb_zero_point = params->scalar.kernel_zero_point;
// Loop over groups of 4 columns.
do {
// `vaccMN` is the accumulator at row `M`, column `N`.
// Initialize accumulators with bias. 4 bias values are loaded from the
// weight matrix, at the start of the group of 4 columns.
int32_t bias0 = ((const int32_t*)w)[0];
int32_t vacc00 = bias0;
int32_t vacc10 = bias0;
int32_t vacc20 = bias0;
int32_t vacc30 = bias0;
int32_t vacc40 = bias0;
int32_t vacc50 = bias0;
int32_t vacc60 = bias0;
int32_t vacc70 = bias0;
int32_t vacc80 = bias0;
int32_t vacc90 = bias0;
int32_t vacc100 = bias0;
int32_t vacc110 = bias0;
int32_t bias1 = ((const int32_t*)w)[1];
int32_t vacc01 = bias1;
int32_t vacc11 = bias1;
int32_t vacc21 = bias1;
int32_t vacc31 = bias1;
int32_t vacc41 = bias1;
int32_t vacc51 = bias1;
int32_t vacc61 = bias1;
int32_t vacc71 = bias1;
int32_t vacc81 = bias1;
int32_t vacc91 = bias1;
int32_t vacc101 = bias1;
int32_t vacc111 = bias1;
int32_t bias2 = ((const int32_t*)w)[2];
int32_t vacc02 = bias2;
int32_t vacc12 = bias2;
int32_t vacc22 = bias2;
int32_t vacc32 = bias2;
int32_t vacc42 = bias2;
int32_t vacc52 = bias2;
int32_t vacc62 = bias2;
int32_t vacc72 = bias2;
int32_t vacc82 = bias2;
int32_t vacc92 = bias2;
int32_t vacc102 = bias2;
int32_t vacc112 = bias2;
int32_t bias3 = ((const int32_t*)w)[3];
int32_t vacc03 = bias3;
int32_t vacc13 = bias3;
int32_t vacc23 = bias3;
int32_t vacc33 = bias3;
int32_t vacc43 = bias3;
int32_t vacc53 = bias3;
int32_t vacc63 = bias3;
int32_t vacc73 = bias3;
int32_t vacc83 = bias3;
int32_t vacc93 = bias3;
int32_t vacc103 = bias3;
int32_t vacc113 = bias3;
w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
// Inner accumulation loop along the 4 columns.
// Handle 4 rows at each iteration: this is key to modelling what an
// actual kernel using ARMv8.2 dot-product instructions would look like.
size_t k = 0;
while (k < kc) {
// Load a 12x4 block of activations, and compute sums along rows.
int16_t vasum0 = 0;
int32_t va00 = *a0++;
vasum0 += (int16_t) va00;
int32_t va01 = *a0++;
vasum0 += (int16_t) va01;
int32_t va02 = *a0++;
vasum0 += (int16_t) va02;
int32_t va03 = *a0++;
vasum0 += (int16_t) va03;
int16_t vasum1 = 0;
int32_t va10 = *a1++;
vasum1 += (int16_t) va10;
int32_t va11 = *a1++;
vasum1 += (int16_t) va11;
int32_t va12 = *a1++;
vasum1 += (int16_t) va12;
int32_t va13 = *a1++;
vasum1 += (int16_t) va13;
int16_t vasum2 = 0;
int32_t va20 = *a2++;
vasum2 += (int16_t) va20;
int32_t va21 = *a2++;
vasum2 += (int16_t) va21;
int32_t va22 = *a2++;
vasum2 += (int16_t) va22;
int32_t va23 = *a2++;
vasum2 += (int16_t) va23;
int16_t vasum3 = 0;
int32_t va30 = *a3++;
vasum3 += (int16_t) va30;
int32_t va31 = *a3++;
vasum3 += (int16_t) va31;
int32_t va32 = *a3++;
vasum3 += (int16_t) va32;
int32_t va33 = *a3++;
vasum3 += (int16_t) va33;
int16_t vasum4 = 0;
int32_t va40 = *a4++;
vasum4 += (int16_t) va40;
int32_t va41 = *a4++;
vasum4 += (int16_t) va41;
int32_t va42 = *a4++;
vasum4 += (int16_t) va42;
int32_t va43 = *a4++;
vasum4 += (int16_t) va43;
int16_t vasum5 = 0;
int32_t va50 = *a5++;
vasum5 += (int16_t) va50;
int32_t va51 = *a5++;
vasum5 += (int16_t) va51;
int32_t va52 = *a5++;
vasum5 += (int16_t) va52;
int32_t va53 = *a5++;
vasum5 += (int16_t) va53;
int16_t vasum6 = 0;
int32_t va60 = *a6++;
vasum6 += (int16_t) va60;
int32_t va61 = *a6++;
vasum6 += (int16_t) va61;
int32_t va62 = *a6++;
vasum6 += (int16_t) va62;
int32_t va63 = *a6++;
vasum6 += (int16_t) va63;
int16_t vasum7 = 0;
int32_t va70 = *a7++;
vasum7 += (int16_t) va70;
int32_t va71 = *a7++;
vasum7 += (int16_t) va71;
int32_t va72 = *a7++;
vasum7 += (int16_t) va72;
int32_t va73 = *a7++;
vasum7 += (int16_t) va73;
int16_t vasum8 = 0;
int32_t va80 = *a8++;
vasum8 += (int16_t) va80;
int32_t va81 = *a8++;
vasum8 += (int16_t) va81;
int32_t va82 = *a8++;
vasum8 += (int16_t) va82;
int32_t va83 = *a8++;
vasum8 += (int16_t) va83;
int16_t vasum9 = 0;
int32_t va90 = *a9++;
vasum9 += (int16_t) va90;
int32_t va91 = *a9++;
vasum9 += (int16_t) va91;
int32_t va92 = *a9++;
vasum9 += (int16_t) va92;
int32_t va93 = *a9++;
vasum9 += (int16_t) va93;
int16_t vasum10 = 0;
int32_t va100 = *a10++;
vasum10 += (int16_t) va100;
int32_t va101 = *a10++;
vasum10 += (int16_t) va101;
int32_t va102 = *a10++;
vasum10 += (int16_t) va102;
int32_t va103 = *a10++;
vasum10 += (int16_t) va103;
int16_t vasum11 = 0;
int32_t va110 = *a11++;
vasum11 += (int16_t) va110;
int32_t va111 = *a11++;
vasum11 += (int16_t) va111;
int32_t va112 = *a11++;
vasum11 += (int16_t) va112;
int32_t va113 = *a11++;
vasum11 += (int16_t) va113;
// Load a 4x4 block of weights.
int32_t vb00 = (int32_t) ((const uint8_t*)w)[0];
int32_t vb10 = (int32_t) ((const uint8_t*)w)[1];
int32_t vb20 = (int32_t) ((const uint8_t*)w)[2];
int32_t vb30 = (int32_t) ((const uint8_t*)w)[3];
w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t));
int32_t vb01 = (int32_t) ((const uint8_t*)w)[0];
int32_t vb11 = (int32_t) ((const uint8_t*)w)[1];
int32_t vb21 = (int32_t) ((const uint8_t*)w)[2];
int32_t vb31 = (int32_t) ((const uint8_t*)w)[3];
w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t));
int32_t vb02 = (int32_t) ((const uint8_t*)w)[0];
int32_t vb12 = (int32_t) ((const uint8_t*)w)[1];
int32_t vb22 = (int32_t) ((const uint8_t*)w)[2];
int32_t vb32 = (int32_t) ((const uint8_t*)w)[3];
w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t));
int32_t vb03 = (int32_t) ((const uint8_t*)w)[0];
int32_t vb13 = (int32_t) ((const uint8_t*)w)[1];
int32_t vb23 = (int32_t) ((const uint8_t*)w)[2];
int32_t vb33 = (int32_t) ((const uint8_t*)w)[3];
w = (const void*) ((uintptr_t) w + 4 * sizeof(uint8_t));
// Multiply-accumulate: 12x4 * 4x4 --> 12x4. The inner size 4 here means
// we're computing 4D dot-products, which makes this a model for
// an ARMv8.2 dot-product kernel.
vacc00 += va00 * vb00;
vacc00 += va01 * vb10;
vacc00 += va02 * vb20;
vacc00 += va03 * vb30;
vacc00 -= ((int32_t) vasum0) * vb_zero_point;
vacc01 += va00 * vb01;
vacc01 += va01 * vb11;
vacc01 += va02 * vb21;
vacc01 += va03 * vb31;
vacc01 -= ((int32_t) vasum0) * vb_zero_point;
vacc02 += va00 * vb02;
vacc02 += va01 * vb12;
vacc02 += va02 * vb22;
vacc02 += va03 * vb32;
vacc02 -= ((int32_t) vasum0) * vb_zero_point;
vacc03 += va00 * vb03;
vacc03 += va01 * vb13;
vacc03 += va02 * vb23;
vacc03 += va03 * vb33;
vacc03 -= ((int32_t) vasum0) * vb_zero_point;
vacc10 += va10 * vb00;
vacc10 += va11 * vb10;
vacc10 += va12 * vb20;
vacc10 += va13 * vb30;
vacc10 -= ((int32_t) vasum1) * vb_zero_point;
vacc11 += va10 * vb01;
vacc11 += va11 * vb11;
vacc11 += va12 * vb21;
vacc11 += va13 * vb31;
vacc11 -= ((int32_t) vasum1) * vb_zero_point;
vacc12 += va10 * vb02;
vacc12 += va11 * vb12;
vacc12 += va12 * vb22;
vacc12 += va13 * vb32;
vacc12 -= ((int32_t) vasum1) * vb_zero_point;
vacc13 += va10 * vb03;
vacc13 += va11 * vb13;
vacc13 += va12 * vb23;
vacc13 += va13 * vb33;
vacc13 -= ((int32_t) vasum1) * vb_zero_point;
vacc20 += va20 * vb00;
vacc20 += va21 * vb10;
vacc20 += va22 * vb20;
vacc20 += va23 * vb30;
vacc20 -= ((int32_t) vasum2) * vb_zero_point;
vacc21 += va20 * vb01;
vacc21 += va21 * vb11;
vacc21 += va22 * vb21;
vacc21 += va23 * vb31;
vacc21 -= ((int32_t) vasum2) * vb_zero_point;
vacc22 += va20 * vb02;
vacc22 += va21 * vb12;
vacc22 += va22 * vb22;
vacc22 += va23 * vb32;
vacc22 -= ((int32_t) vasum2) * vb_zero_point;
vacc23 += va20 * vb03;
vacc23 += va21 * vb13;
vacc23 += va22 * vb23;
vacc23 += va23 * vb33;
vacc23 -= ((int32_t) vasum2) * vb_zero_point;
vacc30 += va30 * vb00;
vacc30 += va31 * vb10;
vacc30 += va32 * vb20;
vacc30 += va33 * vb30;
vacc30 -= ((int32_t) vasum3) * vb_zero_point;
vacc31 += va30 * vb01;
vacc31 += va31 * vb11;
vacc31 += va32 * vb21;
vacc31 += va33 * vb31;
vacc31 -= ((int32_t) vasum3) * vb_zero_point;
vacc32 += va30 * vb02;
vacc32 += va31 * vb12;
vacc32 += va32 * vb22;
vacc32 += va33 * vb32;
vacc32 -= ((int32_t) vasum3) * vb_zero_point;
vacc33 += va30 * vb03;
vacc33 += va31 * vb13;
vacc33 += va32 * vb23;
vacc33 += va33 * vb33;
vacc33 -= ((int32_t) vasum3) * vb_zero_point;
vacc40 += va40 * vb00;
vacc40 += va41 * vb10;
vacc40 += va42 * vb20;
vacc40 += va43 * vb30;
vacc40 -= ((int32_t) vasum4) * vb_zero_point;
vacc41 += va40 * vb01;
vacc41 += va41 * vb11;
vacc41 += va42 * vb21;
vacc41 += va43 * vb31;
vacc41 -= ((int32_t) vasum4) * vb_zero_point;
vacc42 += va40 * vb02;
vacc42 += va41 * vb12;
vacc42 += va42 * vb22;
vacc42 += va43 * vb32;
vacc42 -= ((int32_t) vasum4) * vb_zero_point;
vacc43 += va40 * vb03;
vacc43 += va41 * vb13;
vacc43 += va42 * vb23;
vacc43 += va43 * vb33;
vacc43 -= ((int32_t) vasum4) * vb_zero_point;
vacc50 += va50 * vb00;
vacc50 += va51 * vb10;
vacc50 += va52 * vb20;
vacc50 += va53 * vb30;
vacc50 -= ((int32_t) vasum5) * vb_zero_point;
vacc51 += va50 * vb01;
vacc51 += va51 * vb11;
vacc51 += va52 * vb21;
vacc51 += va53 * vb31;
vacc51 -= ((int32_t) vasum5) * vb_zero_point;
vacc52 += va50 * vb02;
vacc52 += va51 * vb12;
vacc52 += va52 * vb22;
vacc52 += va53 * vb32;
vacc52 -= ((int32_t) vasum5) * vb_zero_point;
vacc53 += va50 * vb03;
vacc53 += va51 * vb13;
vacc53 += va52 * vb23;
vacc53 += va53 * vb33;
vacc53 -= ((int32_t) vasum5) * vb_zero_point;
vacc60 += va60 * vb00;
vacc60 += va61 * vb10;
vacc60 += va62 * vb20;
vacc60 += va63 * vb30;
vacc60 -= ((int32_t) vasum6) * vb_zero_point;
vacc61 += va60 * vb01;
vacc61 += va61 * vb11;
vacc61 += va62 * vb21;
vacc61 += va63 * vb31;
vacc61 -= ((int32_t) vasum6) * vb_zero_point;
vacc62 += va60 * vb02;
vacc62 += va61 * vb12;
vacc62 += va62 * vb22;
vacc62 += va63 * vb32;
vacc62 -= ((int32_t) vasum6) * vb_zero_point;
vacc63 += va60 * vb03;
vacc63 += va61 * vb13;
vacc63 += va62 * vb23;
vacc63 += va63 * vb33;
vacc63 -= ((int32_t) vasum6) * vb_zero_point;
vacc70 += va70 * vb00;
vacc70 += va71 * vb10;
vacc70 += va72 * vb20;
vacc70 += va73 * vb30;
vacc70 -= ((int32_t) vasum7) * vb_zero_point;
vacc71 += va70 * vb01;
vacc71 += va71 * vb11;
vacc71 += va72 * vb21;
vacc71 += va73 * vb31;
vacc71 -= ((int32_t) vasum7) * vb_zero_point;
vacc72 += va70 * vb02;
vacc72 += va71 * vb12;
vacc72 += va72 * vb22;
vacc72 += va73 * vb32;
vacc72 -= ((int32_t) vasum7) * vb_zero_point;
vacc73 += va70 * vb03;
vacc73 += va71 * vb13;
vacc73 += va72 * vb23;
vacc73 += va73 * vb33;
vacc73 -= ((int32_t) vasum7) * vb_zero_point;
vacc80 += va80 * vb00;
vacc80 += va81 * vb10;
vacc80 += va82 * vb20;
vacc80 += va83 * vb30;
vacc80 -= ((int32_t) vasum8) * vb_zero_point;
vacc81 += va80 * vb01;
vacc81 += va81 * vb11;
vacc81 += va82 * vb21;
vacc81 += va83 * vb31;
vacc81 -= ((int32_t) vasum8) * vb_zero_point;
vacc82 += va80 * vb02;
vacc82 += va81 * vb12;
vacc82 += va82 * vb22;
vacc82 += va83 * vb32;
vacc82 -= ((int32_t) vasum8) * vb_zero_point;
vacc83 += va80 * vb03;
vacc83 += va81 * vb13;
vacc83 += va82 * vb23;
vacc83 += va83 * vb33;
vacc83 -= ((int32_t) vasum8) * vb_zero_point;
vacc90 += va90 * vb00;
vacc90 += va91 * vb10;
vacc90 += va92 * vb20;
vacc90 += va93 * vb30;
vacc90 -= ((int32_t) vasum9) * vb_zero_point;
vacc91 += va90 * vb01;
vacc91 += va91 * vb11;
vacc91 += va92 * vb21;
vacc91 += va93 * vb31;
vacc91 -= ((int32_t) vasum9) * vb_zero_point;
vacc92 += va90 * vb02;
vacc92 += va91 * vb12;
vacc92 += va92 * vb22;
vacc92 += va93 * vb32;
vacc92 -= ((int32_t) vasum9) * vb_zero_point;
vacc93 += va90 * vb03;
vacc93 += va91 * vb13;
vacc93 += va92 * vb23;
vacc93 += va93 * vb33;
vacc93 -= ((int32_t) vasum9) * vb_zero_point;
vacc100 += va100 * vb00;
vacc100 += va101 * vb10;
vacc100 += va102 * vb20;
vacc100 += va103 * vb30;
vacc100 -= ((int32_t) vasum10) * vb_zero_point;
vacc101 += va100 * vb01;
vacc101 += va101 * vb11;
vacc101 += va102 * vb21;
vacc101 += va103 * vb31;
vacc101 -= ((int32_t) vasum10) * vb_zero_point;
vacc102 += va100 * vb02;
vacc102 += va101 * vb12;
vacc102 += va102 * vb22;
vacc102 += va103 * vb32;
vacc102 -= ((int32_t) vasum10) * vb_zero_point;
vacc103 += va100 * vb03;
vacc103 += va101 * vb13;
vacc103 += va102 * vb23;
vacc103 += va103 * vb33;
vacc103 -= ((int32_t) vasum10) * vb_zero_point;
vacc110 += va110 * vb00;
vacc110 += va111 * vb10;
vacc110 += va112 * vb20;
vacc110 += va113 * vb30;
vacc110 -= ((int32_t) vasum11) * vb_zero_point;
vacc111 += va110 * vb01;
vacc111 += va111 * vb11;
vacc111 += va112 * vb21;
vacc111 += va113 * vb31;
vacc111 -= ((int32_t) vasum11) * vb_zero_point;
vacc112 += va110 * vb02;
vacc112 += va111 * vb12;
vacc112 += va112 * vb22;
vacc112 += va113 * vb32;
vacc112 -= ((int32_t) vasum11) * vb_zero_point;
vacc113 += va110 * vb03;
vacc113 += va111 * vb13;
vacc113 += va112 * vb23;
vacc113 += va113 * vb33;
vacc113 -= ((int32_t) vasum11) * vb_zero_point;
k += 4 * sizeof(uint8_t);
}
// End of accumulation loop. The variable `k` contains the amount by which
// we advanced the `va` pointers, so we rewind by this amount now.
a0 = (const uint8_t*)((uintptr_t) a0 - k);
a1 = (const uint8_t*)((uintptr_t) a1 - k);
a2 = (const uint8_t*)((uintptr_t) a2 - k);
a3 = (const uint8_t*)((uintptr_t) a3 - k);
a4 = (const uint8_t*)((uintptr_t) a4 - k);
a5 = (const uint8_t*)((uintptr_t) a5 - k);
a6 = (const uint8_t*)((uintptr_t) a6 - k);
a7 = (const uint8_t*)((uintptr_t) a7 - k);
a8 = (const uint8_t*)((uintptr_t) a8 - k);
a9 = (const uint8_t*)((uintptr_t) a9 - k);
a10 = (const uint8_t*)((uintptr_t) a10 - k);
a11 = (const uint8_t*)((uintptr_t) a11 - k);
// Post-accumulation work
const int32_t vmultiplier = params->scalar.multiplier;
const int64_t vq31rounding = INT64_C(0x40000000);
const int32_t vremainder_mask = params->scalar.remainder_mask;
const uint32_t vshift = params->scalar.shift;
const int32_t vremainder_threshold = params->scalar.remainder_threshold;
const int32_t voutput_min = params->scalar.output_min_less_zero_point;
const int32_t voutput_max = params->scalar.output_max_less_zero_point;
const int32_t voutput_zero_point = params->scalar.output_zero_point;
const int64_t vproduct00 = (int64_t)vacc00 * (int64_t)vmultiplier;
const int64_t vproduct01 = (int64_t)vacc01 * (int64_t)vmultiplier;
const int64_t vproduct02 = (int64_t)vacc02 * (int64_t)vmultiplier;
const int64_t vproduct03 = (int64_t)vacc03 * (int64_t)vmultiplier;
const int64_t vproduct10 = (int64_t)vacc10 * (int64_t)vmultiplier;
const int64_t vproduct11 = (int64_t)vacc11 * (int64_t)vmultiplier;
const int64_t vproduct12 = (int64_t)vacc12 * (int64_t)vmultiplier;
const int64_t vproduct13 = (int64_t)vacc13 * (int64_t)vmultiplier;
const int64_t vproduct20 = (int64_t)vacc20 * (int64_t)vmultiplier;
const int64_t vproduct21 = (int64_t)vacc21 * (int64_t)vmultiplier;
const int64_t vproduct22 = (int64_t)vacc22 * (int64_t)vmultiplier;
const int64_t vproduct23 = (int64_t)vacc23 * (int64_t)vmultiplier;
const int64_t vproduct30 = (int64_t)vacc30 * (int64_t)vmultiplier;
const int64_t vproduct31 = (int64_t)vacc31 * (int64_t)vmultiplier;
const int64_t vproduct32 = (int64_t)vacc32 * (int64_t)vmultiplier;
const int64_t vproduct33 = (int64_t)vacc33 * (int64_t)vmultiplier;
const int64_t vproduct40 = (int64_t)vacc40 * (int64_t)vmultiplier;
const int64_t vproduct41 = (int64_t)vacc41 * (int64_t)vmultiplier;
const int64_t vproduct42 = (int64_t)vacc42 * (int64_t)vmultiplier;
const int64_t vproduct43 = (int64_t)vacc43 * (int64_t)vmultiplier;
const int64_t vproduct50 = (int64_t)vacc50 * (int64_t)vmultiplier;
const int64_t vproduct51 = (int64_t)vacc51 * (int64_t)vmultiplier;
const int64_t vproduct52 = (int64_t)vacc52 * (int64_t)vmultiplier;
const int64_t vproduct53 = (int64_t)vacc53 * (int64_t)vmultiplier;
const int64_t vproduct60 = (int64_t)vacc60 * (int64_t)vmultiplier;
const int64_t vproduct61 = (int64_t)vacc61 * (int64_t)vmultiplier;
const int64_t vproduct62 = (int64_t)vacc62 * (int64_t)vmultiplier;
const int64_t vproduct63 = (int64_t)vacc63 * (int64_t)vmultiplier;
const int64_t vproduct70 = (int64_t)vacc70 * (int64_t)vmultiplier;
const int64_t vproduct71 = (int64_t)vacc71 * (int64_t)vmultiplier;
const int64_t vproduct72 = (int64_t)vacc72 * (int64_t)vmultiplier;
const int64_t vproduct73 = (int64_t)vacc73 * (int64_t)vmultiplier;
const int64_t vproduct80 = (int64_t)vacc80 * (int64_t)vmultiplier;
const int64_t vproduct81 = (int64_t)vacc81 * (int64_t)vmultiplier;
const int64_t vproduct82 = (int64_t)vacc82 * (int64_t)vmultiplier;
const int64_t vproduct83 = (int64_t)vacc83 * (int64_t)vmultiplier;
const int64_t vproduct90 = (int64_t)vacc90 * (int64_t)vmultiplier;
const int64_t vproduct91 = (int64_t)vacc91 * (int64_t)vmultiplier;
const int64_t vproduct92 = (int64_t)vacc92 * (int64_t)vmultiplier;
const int64_t vproduct93 = (int64_t)vacc93 * (int64_t)vmultiplier;
const int64_t vproduct100 = (int64_t)vacc100 * (int64_t)vmultiplier;
const int64_t vproduct101 = (int64_t)vacc101 * (int64_t)vmultiplier;
const int64_t vproduct102 = (int64_t)vacc102 * (int64_t)vmultiplier;
const int64_t vproduct103 = (int64_t)vacc103 * (int64_t)vmultiplier;
const int64_t vproduct110 = (int64_t)vacc110 * (int64_t)vmultiplier;
const int64_t vproduct111 = (int64_t)vacc111 * (int64_t)vmultiplier;
const int64_t vproduct112 = (int64_t)vacc112 * (int64_t)vmultiplier;
const int64_t vproduct113 = (int64_t)vacc113 * (int64_t)vmultiplier;
const int32_t vq31product00 = (int32_t)(uint32_t)((uint64_t)(vproduct00 + vq31rounding) >> 31);
const int32_t vq31product01 = (int32_t)(uint32_t)((uint64_t)(vproduct01 + vq31rounding) >> 31);
const int32_t vq31product02 = (int32_t)(uint32_t)((uint64_t)(vproduct02 + vq31rounding) >> 31);
const int32_t vq31product03 = (int32_t)(uint32_t)((uint64_t)(vproduct03 + vq31rounding) >> 31);
const int32_t vq31product10 = (int32_t)(uint32_t)((uint64_t)(vproduct10 + vq31rounding) >> 31);
const int32_t vq31product11 = (int32_t)(uint32_t)((uint64_t)(vproduct11 + vq31rounding) >> 31);
const int32_t vq31product12 = (int32_t)(uint32_t)((uint64_t)(vproduct12 + vq31rounding) >> 31);
const int32_t vq31product13 = (int32_t)(uint32_t)((uint64_t)(vproduct13 + vq31rounding) >> 31);
const int32_t vq31product20 = (int32_t)(uint32_t)((uint64_t)(vproduct20 + vq31rounding) >> 31);
const int32_t vq31product21 = (int32_t)(uint32_t)((uint64_t)(vproduct21 + vq31rounding) >> 31);
const int32_t vq31product22 = (int32_t)(uint32_t)((uint64_t)(vproduct22 + vq31rounding) >> 31);
const int32_t vq31product23 = (int32_t)(uint32_t)((uint64_t)(vproduct23 + vq31rounding) >> 31);
const int32_t vq31product30 = (int32_t)(uint32_t)((uint64_t)(vproduct30 + vq31rounding) >> 31);
const int32_t vq31product31 = (int32_t)(uint32_t)((uint64_t)(vproduct31 + vq31rounding) >> 31);
const int32_t vq31product32 = (int32_t)(uint32_t)((uint64_t)(vproduct32 + vq31rounding) >> 31);
const int32_t vq31product33 = (int32_t)(uint32_t)((uint64_t)(vproduct33 + vq31rounding) >> 31);
const int32_t vq31product40 = (int32_t)(uint32_t)((uint64_t)(vproduct40 + vq31rounding) >> 31);
const int32_t vq31product41 = (int32_t)(uint32_t)((uint64_t)(vproduct41 + vq31rounding) >> 31);
const int32_t vq31product42 = (int32_t)(uint32_t)((uint64_t)(vproduct42 + vq31rounding) >> 31);
const int32_t vq31product43 = (int32_t)(uint32_t)((uint64_t)(vproduct43 + vq31rounding) >> 31);
const int32_t vq31product50 = (int32_t)(uint32_t)((uint64_t)(vproduct50 + vq31rounding) >> 31);
const int32_t vq31product51 = (int32_t)(uint32_t)((uint64_t)(vproduct51 + vq31rounding) >> 31);
const int32_t vq31product52 = (int32_t)(uint32_t)((uint64_t)(vproduct52 + vq31rounding) >> 31);
const int32_t vq31product53 = (int32_t)(uint32_t)((uint64_t)(vproduct53 + vq31rounding) >> 31);
const int32_t vq31product60 = (int32_t)(uint32_t)((uint64_t)(vproduct60 + vq31rounding) >> 31);
const int32_t vq31product61 = (int32_t)(uint32_t)((uint64_t)(vproduct61 + vq31rounding) >> 31);
const int32_t vq31product62 = (int32_t)(uint32_t)((uint64_t)(vproduct62 + vq31rounding) >> 31);
const int32_t vq31product63 = (int32_t)(uint32_t)((uint64_t)(vproduct63 + vq31rounding) >> 31);
const int32_t vq31product70 = (int32_t)(uint32_t)((uint64_t)(vproduct70 + vq31rounding) >> 31);
const int32_t vq31product71 = (int32_t)(uint32_t)((uint64_t)(vproduct71 + vq31rounding) >> 31);
const int32_t vq31product72 = (int32_t)(uint32_t)((uint64_t)(vproduct72 + vq31rounding) >> 31);
const int32_t vq31product73 = (int32_t)(uint32_t)((uint64_t)(vproduct73 + vq31rounding) >> 31);
const int32_t vq31product80 = (int32_t)(uint32_t)((uint64_t)(vproduct80 + vq31rounding) >> 31);
const int32_t vq31product81 = (int32_t)(uint32_t)((uint64_t)(vproduct81 + vq31rounding) >> 31);
const int32_t vq31product82 = (int32_t)(uint32_t)((uint64_t)(vproduct82 + vq31rounding) >> 31);
const int32_t vq31product83 = (int32_t)(uint32_t)((uint64_t)(vproduct83 + vq31rounding) >> 31);
const int32_t vq31product90 = (int32_t)(uint32_t)((uint64_t)(vproduct90 + vq31rounding) >> 31);
const int32_t vq31product91 = (int32_t)(uint32_t)((uint64_t)(vproduct91 + vq31rounding) >> 31);
const int32_t vq31product92 = (int32_t)(uint32_t)((uint64_t)(vproduct92 + vq31rounding) >> 31);
const int32_t vq31product93 = (int32_t)(uint32_t)((uint64_t)(vproduct93 + vq31rounding) >> 31);
const int32_t vq31product100 = (int32_t)(uint32_t)((uint64_t)(vproduct100 + vq31rounding) >> 31);
const int32_t vq31product101 = (int32_t)(uint32_t)((uint64_t)(vproduct101 + vq31rounding) >> 31);
const int32_t vq31product102 = (int32_t)(uint32_t)((uint64_t)(vproduct102 + vq31rounding) >> 31);
const int32_t vq31product103 = (int32_t)(uint32_t)((uint64_t)(vproduct103 + vq31rounding) >> 31);
const int32_t vq31product110 = (int32_t)(uint32_t)((uint64_t)(vproduct110 + vq31rounding) >> 31);
const int32_t vq31product111 = (int32_t)(uint32_t)((uint64_t)(vproduct111 + vq31rounding) >> 31);
const int32_t vq31product112 = (int32_t)(uint32_t)((uint64_t)(vproduct112 + vq31rounding) >> 31);
const int32_t vq31product113 = (int32_t)(uint32_t)((uint64_t)(vproduct113 + vq31rounding) >> 31);
const int32_t vremainder00 = (vq31product00 & vremainder_mask) - (int32_t)(vq31product00 < 0);
const int32_t vremainder01 = (vq31product01 & vremainder_mask) - (int32_t)(vq31product01 < 0);
const int32_t vremainder02 = (vq31product02 & vremainder_mask) - (int32_t)(vq31product02 < 0);
const int32_t vremainder03 = (vq31product03 & vremainder_mask) - (int32_t)(vq31product03 < 0);
const int32_t vremainder10 = (vq31product10 & vremainder_mask) - (int32_t)(vq31product10 < 0);
const int32_t vremainder11 = (vq31product11 & vremainder_mask) - (int32_t)(vq31product11 < 0);
const int32_t vremainder12 = (vq31product12 & vremainder_mask) - (int32_t)(vq31product12 < 0);
const int32_t vremainder13 = (vq31product13 & vremainder_mask) - (int32_t)(vq31product13 < 0);
const int32_t vremainder20 = (vq31product20 & vremainder_mask) - (int32_t)(vq31product20 < 0);
const int32_t vremainder21 = (vq31product21 & vremainder_mask) - (int32_t)(vq31product21 < 0);
const int32_t vremainder22 = (vq31product22 & vremainder_mask) - (int32_t)(vq31product22 < 0);
const int32_t vremainder23 = (vq31product23 & vremainder_mask) - (int32_t)(vq31product23 < 0);
const int32_t vremainder30 = (vq31product30 & vremainder_mask) - (int32_t)(vq31product30 < 0);
const int32_t vremainder31 = (vq31product31 & vremainder_mask) - (int32_t)(vq31product31 < 0);
const int32_t vremainder32 = (vq31product32 & vremainder_mask) - (int32_t)(vq31product32 < 0);
const int32_t vremainder33 = (vq31product33 & vremainder_mask) - (int32_t)(vq31product33 < 0);
const int32_t vremainder40 = (vq31product40 & vremainder_mask) - (int32_t)(vq31product40 < 0);
const int32_t vremainder41 = (vq31product41 & vremainder_mask) - (int32_t)(vq31product41 < 0);
const int32_t vremainder42 = (vq31product42 & vremainder_mask) - (int32_t)(vq31product42 < 0);
const int32_t vremainder43 = (vq31product43 & vremainder_mask) - (int32_t)(vq31product43 < 0);
const int32_t vremainder50 = (vq31product50 & vremainder_mask) - (int32_t)(vq31product50 < 0);
const int32_t vremainder51 = (vq31product51 & vremainder_mask) - (int32_t)(vq31product51 < 0);
const int32_t vremainder52 = (vq31product52 & vremainder_mask) - (int32_t)(vq31product52 < 0);
const int32_t vremainder53 = (vq31product53 & vremainder_mask) - (int32_t)(vq31product53 < 0);
const int32_t vremainder60 = (vq31product60 & vremainder_mask) - (int32_t)(vq31product60 < 0);
const int32_t vremainder61 = (vq31product61 & vremainder_mask) - (int32_t)(vq31product61 < 0);
const int32_t vremainder62 = (vq31product62 & vremainder_mask) - (int32_t)(vq31product62 < 0);
const int32_t vremainder63 = (vq31product63 & vremainder_mask) - (int32_t)(vq31product63 < 0);
const int32_t vremainder70 = (vq31product70 & vremainder_mask) - (int32_t)(vq31product70 < 0);
const int32_t vremainder71 = (vq31product71 & vremainder_mask) - (int32_t)(vq31product71 < 0);
const int32_t vremainder72 = (vq31product72 & vremainder_mask) - (int32_t)(vq31product72 < 0);
const int32_t vremainder73 = (vq31product73 & vremainder_mask) - (int32_t)(vq31product73 < 0);
const int32_t vremainder80 = (vq31product80 & vremainder_mask) - (int32_t)(vq31product80 < 0);
const int32_t vremainder81 = (vq31product81 & vremainder_mask) - (int32_t)(vq31product81 < 0);
const int32_t vremainder82 = (vq31product82 & vremainder_mask) - (int32_t)(vq31product82 < 0);
const int32_t vremainder83 = (vq31product83 & vremainder_mask) - (int32_t)(vq31product83 < 0);
const int32_t vremainder90 = (vq31product90 & vremainder_mask) - (int32_t)(vq31product90 < 0);
const int32_t vremainder91 = (vq31product91 & vremainder_mask) - (int32_t)(vq31product91 < 0);
const int32_t vremainder92 = (vq31product92 & vremainder_mask) - (int32_t)(vq31product92 < 0);
const int32_t vremainder93 = (vq31product93 & vremainder_mask) - (int32_t)(vq31product93 < 0);
const int32_t vremainder100 = (vq31product100 & vremainder_mask) - (int32_t)(vq31product100 < 0);
const int32_t vremainder101 = (vq31product101 & vremainder_mask) - (int32_t)(vq31product101 < 0);
const int32_t vremainder102 = (vq31product102 & vremainder_mask) - (int32_t)(vq31product102 < 0);
const int32_t vremainder103 = (vq31product103 & vremainder_mask) - (int32_t)(vq31product103 < 0);
const int32_t vremainder110 = (vq31product110 & vremainder_mask) - (int32_t)(vq31product110 < 0);
const int32_t vremainder111 = (vq31product111 & vremainder_mask) - (int32_t)(vq31product111 < 0);
const int32_t vremainder112 = (vq31product112 & vremainder_mask) - (int32_t)(vq31product112 < 0);
const int32_t vremainder113 = (vq31product113 & vremainder_mask) - (int32_t)(vq31product113 < 0);
int32_t vout00 = asr_s32(vq31product00, vshift) + (int32_t)(vremainder00 > vremainder_threshold);
int32_t vout01 = asr_s32(vq31product01, vshift) + (int32_t)(vremainder01 > vremainder_threshold);
int32_t vout02 = asr_s32(vq31product02, vshift) + (int32_t)(vremainder02 > vremainder_threshold);
int32_t vout03 = asr_s32(vq31product03, vshift) + (int32_t)(vremainder03 > vremainder_threshold);
int32_t vout10 = asr_s32(vq31product10, vshift) + (int32_t)(vremainder10 > vremainder_threshold);
int32_t vout11 = asr_s32(vq31product11, vshift) + (int32_t)(vremainder11 > vremainder_threshold);
int32_t vout12 = asr_s32(vq31product12, vshift) + (int32_t)(vremainder12 > vremainder_threshold);
int32_t vout13 = asr_s32(vq31product13, vshift) + (int32_t)(vremainder13 > vremainder_threshold);
int32_t vout20 = asr_s32(vq31product20, vshift) + (int32_t)(vremainder20 > vremainder_threshold);
int32_t vout21 = asr_s32(vq31product21, vshift) + (int32_t)(vremainder21 > vremainder_threshold);
int32_t vout22 = asr_s32(vq31product22, vshift) + (int32_t)(vremainder22 > vremainder_threshold);
int32_t vout23 = asr_s32(vq31product23, vshift) + (int32_t)(vremainder23 > vremainder_threshold);
int32_t vout30 = asr_s32(vq31product30, vshift) + (int32_t)(vremainder30 > vremainder_threshold);
int32_t vout31 = asr_s32(vq31product31, vshift) + (int32_t)(vremainder31 > vremainder_threshold);
int32_t vout32 = asr_s32(vq31product32, vshift) + (int32_t)(vremainder32 > vremainder_threshold);
int32_t vout33 = asr_s32(vq31product33, vshift) + (int32_t)(vremainder33 > vremainder_threshold);
int32_t vout40 = asr_s32(vq31product40, vshift) + (int32_t)(vremainder40 > vremainder_threshold);
int32_t vout41 = asr_s32(vq31product41, vshift) + (int32_t)(vremainder41 > vremainder_threshold);
int32_t vout42 = asr_s32(vq31product42, vshift) + (int32_t)(vremainder42 > vremainder_threshold);
int32_t vout43 = asr_s32(vq31product43, vshift) + (int32_t)(vremainder43 > vremainder_threshold);
int32_t vout50 = asr_s32(vq31product50, vshift) + (int32_t)(vremainder50 > vremainder_threshold);
int32_t vout51 = asr_s32(vq31product51, vshift) + (int32_t)(vremainder51 > vremainder_threshold);
int32_t vout52 = asr_s32(vq31product52, vshift) + (int32_t)(vremainder52 > vremainder_threshold);
int32_t vout53 = asr_s32(vq31product53, vshift) + (int32_t)(vremainder53 > vremainder_threshold);
int32_t vout60 = asr_s32(vq31product60, vshift) + (int32_t)(vremainder60 > vremainder_threshold);
int32_t vout61 = asr_s32(vq31product61, vshift) + (int32_t)(vremainder61 > vremainder_threshold);
int32_t vout62 = asr_s32(vq31product62, vshift) + (int32_t)(vremainder62 > vremainder_threshold);
int32_t vout63 = asr_s32(vq31product63, vshift) + (int32_t)(vremainder63 > vremainder_threshold);
int32_t vout70 = asr_s32(vq31product70, vshift) + (int32_t)(vremainder70 > vremainder_threshold);
int32_t vout71 = asr_s32(vq31product71, vshift) + (int32_t)(vremainder71 > vremainder_threshold);
int32_t vout72 = asr_s32(vq31product72, vshift) + (int32_t)(vremainder72 > vremainder_threshold);
int32_t vout73 = asr_s32(vq31product73, vshift) + (int32_t)(vremainder73 > vremainder_threshold);
int32_t vout80 = asr_s32(vq31product80, vshift) + (int32_t)(vremainder80 > vremainder_threshold);
int32_t vout81 = asr_s32(vq31product81, vshift) + (int32_t)(vremainder81 > vremainder_threshold);
int32_t vout82 = asr_s32(vq31product82, vshift) + (int32_t)(vremainder82 > vremainder_threshold);
int32_t vout83 = asr_s32(vq31product83, vshift) + (int32_t)(vremainder83 > vremainder_threshold);
int32_t vout90 = asr_s32(vq31product90, vshift) + (int32_t)(vremainder90 > vremainder_threshold);
int32_t vout91 = asr_s32(vq31product91, vshift) + (int32_t)(vremainder91 > vremainder_threshold);
int32_t vout92 = asr_s32(vq31product92, vshift) + (int32_t)(vremainder92 > vremainder_threshold);
int32_t vout93 = asr_s32(vq31product93, vshift) + (int32_t)(vremainder93 > vremainder_threshold);
int32_t vout100 = asr_s32(vq31product100, vshift) + (int32_t)(vremainder100 > vremainder_threshold);
int32_t vout101 = asr_s32(vq31product101, vshift) + (int32_t)(vremainder101 > vremainder_threshold);
int32_t vout102 = asr_s32(vq31product102, vshift) + (int32_t)(vremainder102 > vremainder_threshold);
int32_t vout103 = asr_s32(vq31product103, vshift) + (int32_t)(vremainder103 > vremainder_threshold);
int32_t vout110 = asr_s32(vq31product110, vshift) + (int32_t)(vremainder110 > vremainder_threshold);
int32_t vout111 = asr_s32(vq31product111, vshift) + (int32_t)(vremainder111 > vremainder_threshold);
int32_t vout112 = asr_s32(vq31product112, vshift) + (int32_t)(vremainder112 > vremainder_threshold);
int32_t vout113 = asr_s32(vq31product113, vshift) + (int32_t)(vremainder113 > vremainder_threshold);
vout00 = vout00 < voutput_min ? voutput_min : vout00;
vout01 = vout01 < voutput_min ? voutput_min : vout01;
vout02 = vout02 < voutput_min ? voutput_min : vout02;
vout03 = vout03 < voutput_min ? voutput_min : vout03;
vout10 = vout10 < voutput_min ? voutput_min : vout10;
vout11 = vout11 < voutput_min ? voutput_min : vout11;
vout12 = vout12 < voutput_min ? voutput_min : vout12;
vout13 = vout13 < voutput_min ? voutput_min : vout13;
vout20 = vout20 < voutput_min ? voutput_min : vout20;
vout21 = vout21 < voutput_min ? voutput_min : vout21;
vout22 = vout22 < voutput_min ? voutput_min : vout22;
vout23 = vout23 < voutput_min ? voutput_min : vout23;
vout30 = vout30 < voutput_min ? voutput_min : vout30;
vout31 = vout31 < voutput_min ? voutput_min : vout31;
vout32 = vout32 < voutput_min ? voutput_min : vout32;
vout33 = vout33 < voutput_min ? voutput_min : vout33;
vout40 = vout40 < voutput_min ? voutput_min : vout40;
vout41 = vout41 < voutput_min ? voutput_min : vout41;
vout42 = vout42 < voutput_min ? voutput_min : vout42;
vout43 = vout43 < voutput_min ? voutput_min : vout43;
vout50 = vout50 < voutput_min ? voutput_min : vout50;
vout51 = vout51 < voutput_min ? voutput_min : vout51;
vout52 = vout52 < voutput_min ? voutput_min : vout52;
vout53 = vout53 < voutput_min ? voutput_min : vout53;
vout60 = vout60 < voutput_min ? voutput_min : vout60;
vout61 = vout61 < voutput_min ? voutput_min : vout61;
vout62 = vout62 < voutput_min ? voutput_min : vout62;
vout63 = vout63 < voutput_min ? voutput_min : vout63;
vout70 = vout70 < voutput_min ? voutput_min : vout70;
vout71 = vout71 < voutput_min ? voutput_min : vout71;
vout72 = vout72 < voutput_min ? voutput_min : vout72;
vout73 = vout73 < voutput_min ? voutput_min : vout73;
vout80 = vout80 < voutput_min ? voutput_min : vout80;
vout81 = vout81 < voutput_min ? voutput_min : vout81;
vout82 = vout82 < voutput_min ? voutput_min : vout82;
vout83 = vout83 < voutput_min ? voutput_min : vout83;
vout90 = vout90 < voutput_min ? voutput_min : vout90;
vout91 = vout91 < voutput_min ? voutput_min : vout91;
vout92 = vout92 < voutput_min ? voutput_min : vout92;
vout93 = vout93 < voutput_min ? voutput_min : vout93;
vout100 = vout100 < voutput_min ? voutput_min : vout100;
vout101 = vout101 < voutput_min ? voutput_min : vout101;
vout102 = vout102 < voutput_min ? voutput_min : vout102;
vout103 = vout103 < voutput_min ? voutput_min : vout103;
vout110 = vout110 < voutput_min ? voutput_min : vout110;
vout111 = vout111 < voutput_min ? voutput_min : vout111;
vout112 = vout112 < voutput_min ? voutput_min : vout112;
vout113 = vout113 < voutput_min ? voutput_min : vout113;
vout00 = vout00 > voutput_max ? voutput_max : vout00;
vout01 = vout01 > voutput_max ? voutput_max : vout01;
vout02 = vout02 > voutput_max ? voutput_max : vout02;
vout03 = vout03 > voutput_max ? voutput_max : vout03;
vout10 = vout10 > voutput_max ? voutput_max : vout10;
vout11 = vout11 > voutput_max ? voutput_max : vout11;
vout12 = vout12 > voutput_max ? voutput_max : vout12;
vout13 = vout13 > voutput_max ? voutput_max : vout13;
vout20 = vout20 > voutput_max ? voutput_max : vout20;
vout21 = vout21 > voutput_max ? voutput_max : vout21;
vout22 = vout22 > voutput_max ? voutput_max : vout22;
vout23 = vout23 > voutput_max ? voutput_max : vout23;
vout30 = vout30 > voutput_max ? voutput_max : vout30;
vout31 = vout31 > voutput_max ? voutput_max : vout31;
vout32 = vout32 > voutput_max ? voutput_max : vout32;
vout33 = vout33 > voutput_max ? voutput_max : vout33;
vout40 = vout40 > voutput_max ? voutput_max : vout40;
vout41 = vout41 > voutput_max ? voutput_max : vout41;
vout42 = vout42 > voutput_max ? voutput_max : vout42;
vout43 = vout43 > voutput_max ? voutput_max : vout43;
vout50 = vout50 > voutput_max ? voutput_max : vout50;
vout51 = vout51 > voutput_max ? voutput_max : vout51;
vout52 = vout52 > voutput_max ? voutput_max : vout52;
vout53 = vout53 > voutput_max ? voutput_max : vout53;
vout60 = vout60 > voutput_max ? voutput_max : vout60;
vout61 = vout61 > voutput_max ? voutput_max : vout61;
vout62 = vout62 > voutput_max ? voutput_max : vout62;
vout63 = vout63 > voutput_max ? voutput_max : vout63;
vout70 = vout70 > voutput_max ? voutput_max : vout70;
vout71 = vout71 > voutput_max ? voutput_max : vout71;
vout72 = vout72 > voutput_max ? voutput_max : vout72;
vout73 = vout73 > voutput_max ? voutput_max : vout73;
vout80 = vout80 > voutput_max ? voutput_max : vout80;
vout81 = vout81 > voutput_max ? voutput_max : vout81;
vout82 = vout82 > voutput_max ? voutput_max : vout82;
vout83 = vout83 > voutput_max ? voutput_max : vout83;
vout90 = vout90 > voutput_max ? voutput_max : vout90;
vout91 = vout91 > voutput_max ? voutput_max : vout91;
vout92 = vout92 > voutput_max ? voutput_max : vout92;
vout93 = vout93 > voutput_max ? voutput_max : vout93;
vout100 = vout100 > voutput_max ? voutput_max : vout100;
vout101 = vout101 > voutput_max ? voutput_max : vout101;
vout102 = vout102 > voutput_max ? voutput_max : vout102;
vout103 = vout103 > voutput_max ? voutput_max : vout103;
vout110 = vout110 > voutput_max ? voutput_max : vout110;
vout111 = vout111 > voutput_max ? voutput_max : vout111;
vout112 = vout112 > voutput_max ? voutput_max : vout112;
vout113 = vout113 > voutput_max ? voutput_max : vout113;
vout00 += voutput_zero_point;
vout01 += voutput_zero_point;
vout02 += voutput_zero_point;
vout03 += voutput_zero_point;
vout10 += voutput_zero_point;
vout11 += voutput_zero_point;
vout12 += voutput_zero_point;
vout13 += voutput_zero_point;
vout20 += voutput_zero_point;
vout21 += voutput_zero_point;
vout22 += voutput_zero_point;
vout23 += voutput_zero_point;
vout30 += voutput_zero_point;
vout31 += voutput_zero_point;
vout32 += voutput_zero_point;
vout33 += voutput_zero_point;
vout40 += voutput_zero_point;
vout41 += voutput_zero_point;
vout42 += voutput_zero_point;
vout43 += voutput_zero_point;
vout50 += voutput_zero_point;
vout51 += voutput_zero_point;
vout52 += voutput_zero_point;
vout53 += voutput_zero_point;
vout60 += voutput_zero_point;
vout61 += voutput_zero_point;
vout62 += voutput_zero_point;
vout63 += voutput_zero_point;
vout70 += voutput_zero_point;
vout71 += voutput_zero_point;
vout72 += voutput_zero_point;
vout73 += voutput_zero_point;
vout80 += voutput_zero_point;
vout81 += voutput_zero_point;
vout82 += voutput_zero_point;
vout83 += voutput_zero_point;
vout90 += voutput_zero_point;
vout91 += voutput_zero_point;
vout92 += voutput_zero_point;
vout93 += voutput_zero_point;
vout100 += voutput_zero_point;
vout101 += voutput_zero_point;
vout102 += voutput_zero_point;
vout103 += voutput_zero_point;
vout110 += voutput_zero_point;
vout111 += voutput_zero_point;
vout112 += voutput_zero_point;
vout113 += voutput_zero_point;
if XNN_LIKELY (nc >= 4) {
// Main case where all 4 columns fit in the destination.
c0[0] = vout00;
c0[1] = vout01;
c0[2] = vout02;
c0[3] = vout03;
c1[0] = vout10;
c1[1] = vout11;
c1[2] = vout12;
c1[3] = vout13;
c2[0] = vout20;
c2[1] = vout21;
c2[2] = vout22;
c2[3] = vout23;
c3[0] = vout30;
c3[1] = vout31;
c3[2] = vout32;
c3[3] = vout33;
c4[0] = vout40;
c4[1] = vout41;
c4[2] = vout42;
c4[3] = vout43;
c5[0] = vout50;
c5[1] = vout51;
c5[2] = vout52;
c5[3] = vout53;
c6[0] = vout60;
c6[1] = vout61;
c6[2] = vout62;
c6[3] = vout63;
c7[0] = vout70;
c7[1] = vout71;
c7[2] = vout72;
c7[3] = vout73;
c8[0] = vout80;
c8[1] = vout81;
c8[2] = vout82;
c8[3] = vout83;
c9[0] = vout90;
c9[1] = vout91;
c9[2] = vout92;
c9[3] = vout93;
c10[0] = vout100;
c10[1] = vout101;
c10[2] = vout102;
c10[3] = vout103;
c11[0] = vout110;
c11[1] = vout111;
c11[2] = vout112;
c11[3] = vout113;
// Advance to the next 4 columns.
c0 = (uint8_t*)((uintptr_t) c0 + cn_stride);
c1 = (uint8_t*)((uintptr_t) c1 + cn_stride);
c2 = (uint8_t*)((uintptr_t) c2 + cn_stride);
c3 = (uint8_t*)((uintptr_t) c3 + cn_stride);
c4 = (uint8_t*)((uintptr_t) c4 + cn_stride);
c5 = (uint8_t*)((uintptr_t) c5 + cn_stride);
c6 = (uint8_t*)((uintptr_t) c6 + cn_stride);
c7 = (uint8_t*)((uintptr_t) c7 + cn_stride);
c8 = (uint8_t*)((uintptr_t) c8 + cn_stride);
c9 = (uint8_t*)((uintptr_t) c9 + cn_stride);
c10 = (uint8_t*)((uintptr_t) c10 + cn_stride);
c11 = (uint8_t*)((uintptr_t) c11 + cn_stride);
nc -= 4;
} else {
// Final case where not all of the 4 columns fit in the destination.
if (nc > 0) {
c0[0] = vout00;
c1[0] = vout10;
c2[0] = vout20;
c3[0] = vout30;
c4[0] = vout40;
c5[0] = vout50;
c6[0] = vout60;
c7[0] = vout70;
c8[0] = vout80;
c9[0] = vout90;
c10[0] = vout100;
c11[0] = vout110;
}
if (nc > 1) {
c0[1] = vout01;
c1[1] = vout11;
c2[1] = vout21;
c3[1] = vout31;
c4[1] = vout41;
c5[1] = vout51;
c6[1] = vout61;
c7[1] = vout71;
c8[1] = vout81;
c9[1] = vout91;
c10[1] = vout101;
c11[1] = vout111;
}
if (nc > 2) {
c0[2] = vout02;
c1[2] = vout12;
c2[2] = vout22;
c3[2] = vout32;
c4[2] = vout42;
c5[2] = vout52;
c6[2] = vout62;
c7[2] = vout72;
c8[2] = vout82;
c9[2] = vout92;
c10[2] = vout102;
c11[2] = vout112;
}
if (nc > 3) {
c0[3] = vout03;
c1[3] = vout13;
c2[3] = vout23;
c3[3] = vout33;
c4[3] = vout43;
c5[3] = vout53;
c6[3] = vout63;
c7[3] = vout73;
c8[3] = vout83;
c9[3] = vout93;
c10[3] = vout103;
c11[3] = vout113;
}
nc = 0;
}
} while (nc != 0);
}