WAsm Relaxed SIMD QS8/QU8 VCVT & VLRELU microkernels
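
Add Relaxed SIMD variants of the QS8/QU8 VCVT and VLRELU microkernels.
The generated kernels match the existing WAsm SIMD ones except that the
saturating Q15 rounding multiply is replaced with its relaxed counterpart,
selected in the xngen templates via a new RELAXED flag:

  $WASM_I16X8_Q15MULR = "__builtin_wasm_relaxed_q15mulr_s_i16x8" if RELAXED else "wasm_i16x8_q15mulr_sat"

The new kernels are registered in BUILD.bazel and in the qs8/qu8 vcvt and
vlrelu benchmarks. A minimal sketch of the substitution (not part of this
change; assumes a Clang toolchain with -msimd128 -mrelaxed-simd, which
predefines __wasm_relaxed_simd__):

  #include <wasm_simd128.h>

  static v128_t q15mulr(v128_t a, v128_t b) {
  #if defined(__wasm_relaxed_simd__)
    // Relaxed form: the result is implementation-defined only when both
    // operands are INT16_MIN. The kernels never hit that case because the
    // accumulator operand is ((zero_point - x) << 7), which stays within
    // [-32640, 32640] for both QS8 and QU8 inputs.
    return __builtin_wasm_relaxed_q15mulr_s_i16x8(a, b);
  #else
    // Strict WAsm SIMD form: saturating Q15 rounding multiply.
    return wasm_i16x8_q15mulr_sat(a, b);
  #endif
  }

Because the single divergent input is unreachable, the relaxed kernels
produce the same results as the strict ones while letting engines pick a
cheaper lowering.
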
PiperOrigin-RevId: 468049046
diff --git a/BUILD.bazel b/BUILD.bazel
index e0f4b5c..4a8e89a 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -2926,6 +2926,22 @@
"src/f32-vmulcaddc/gen/c4-minmax-wasmrelaxedsimd-2x.c",
"src/f32-vmulcaddc/gen/c8-minmax-wasmrelaxedsimd-fma-2x.c",
"src/f32-vmulcaddc/gen/c8-minmax-wasmrelaxedsimd-2x.c",
+ "src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c",
+ "src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c",
+ "src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c",
+ "src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c",
+ "src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c",
+ "src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c",
+ "src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c",
+ "src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c",
+ "src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c",
+ "src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c",
+ "src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c",
+ "src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c",
+ "src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c",
+ "src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c",
+ "src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c",
+ "src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c",
]
PROD_ARMV6SIMD_MICROKERNEL_SRCS = [
diff --git a/bench/qs8-vcvt.cc b/bench/qs8-vcvt.cc
index 187933b..4f163fa 100644
--- a/bench/qs8-vcvt.cc
+++ b/bench/qs8-vcvt.cc
@@ -182,6 +182,24 @@
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(qs8_vcvt, wasmrelaxedsimd_x8,
+ xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8,
+ xnn_init_qs8_cvt_wasmsimd_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_vcvt, wasmrelaxedsimd_x16,
+ xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16,
+ xnn_init_qs8_cvt_wasmsimd_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_vcvt, wasmrelaxedsimd_x32,
+ xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32,
+ xnn_init_qs8_cvt_wasmsimd_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE(qs8_vcvt, wasmsimd_x8,
xnn_qs8_vcvt_ukernel__wasmsimd_x8,
diff --git a/bench/qs8-vlrelu.cc b/bench/qs8-vlrelu.cc
index 8958083..bac6f96 100644
--- a/bench/qs8-vlrelu.cc
+++ b/bench/qs8-vlrelu.cc
@@ -182,6 +182,35 @@
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(qs8_vlrelu, wasmrelaxedsimd_arm_x16,
+ xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16,
+ xnn_init_qs8_lrelu_wasmsimd_arm_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_vlrelu, wasmrelaxedsimd_arm_x32,
+ xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
+ xnn_init_qs8_lrelu_wasmsimd_arm_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(qs8_vlrelu, wasmrelaxedsimd_x86_x8,
+ xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8,
+ xnn_init_qs8_lrelu_wasmsimd_x86_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_vlrelu, wasmrelaxedsimd_x86_x16,
+ xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16,
+ xnn_init_qs8_lrelu_wasmsimd_x86_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_vlrelu, wasmrelaxedsimd_x86_x32,
+ xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
+ xnn_init_qs8_lrelu_wasmsimd_x86_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE(qs8_vlrelu, wasmsimd_arm_x16,
xnn_qs8_vlrelu_ukernel__wasmsimd_arm_x16,
diff --git a/bench/qu8-vcvt.cc b/bench/qu8-vcvt.cc
index 2f7b3dd..5d6ea53 100644
--- a/bench/qu8-vcvt.cc
+++ b/bench/qu8-vcvt.cc
@@ -182,6 +182,24 @@
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(qu8_vcvt, wasmrelaxedsimd_x8,
+ xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8,
+ xnn_init_qu8_cvt_wasmsimd_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_vcvt, wasmrelaxedsimd_x16,
+ xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16,
+ xnn_init_qu8_cvt_wasmsimd_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_vcvt, wasmrelaxedsimd_x32,
+ xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32,
+ xnn_init_qu8_cvt_wasmsimd_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE(qu8_vcvt, wasmsimd_x8,
xnn_qu8_vcvt_ukernel__wasmsimd_x8,
diff --git a/bench/qu8-vlrelu.cc b/bench/qu8-vlrelu.cc
index 643119a..fca424d 100644
--- a/bench/qu8-vlrelu.cc
+++ b/bench/qu8-vlrelu.cc
@@ -182,6 +182,35 @@
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(qu8_vlrelu, wasmrelaxedsimd_arm_x16,
+ xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16,
+ xnn_init_qu8_lrelu_wasmsimd_arm_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_vlrelu, wasmrelaxedsimd_arm_x32,
+ xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
+ xnn_init_qu8_lrelu_wasmsimd_arm_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(qu8_vlrelu, wasmrelaxedsimd_x86_x8,
+ xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8,
+ xnn_init_qu8_lrelu_wasmsimd_x86_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_vlrelu, wasmrelaxedsimd_x86_x16,
+ xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16,
+ xnn_init_qu8_lrelu_wasmsimd_x86_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qu8_vlrelu, wasmrelaxedsimd_x86_x32,
+ xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
+ xnn_init_qu8_lrelu_wasmsimd_x86_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE(qu8_vlrelu, wasmsimd_arm_x16,
xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16,
diff --git a/scripts/generate-qs8-vcvt.sh b/scripts/generate-qs8-vcvt.sh
index cee0dff..4a60ea8 100755
--- a/scripts/generate-qs8-vcvt.sh
+++ b/scripts/generate-qs8-vcvt.sh
@@ -54,13 +54,21 @@
tools/xngen src/qs8-vcvt/avx2.c.in -D BATCH_TILE=64 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-avx2-x64.c &
################################## WAsm SIMD ##################################
-tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmsimd-x8.c &
-tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmsimd-x16.c &
-tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmsimd-x32.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=8 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmsimd-x8.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmsimd-x16.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=32 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmsimd-x32.c &
-tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmsimd-x8.c &
-tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmsimd-x16.c &
-tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmsimd-x32.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=8 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=32 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c &
+
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=8 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmsimd-x8.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmsimd-x16.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=32 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmsimd-x32.c &
+
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=8 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=16 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c &
+tools/xngen src/qs8-vcvt/wasmsimd.c.in -D BATCH_TILE=32 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c &
################################## ARMv6 SIMD #################################
tools/xngen src/qs8-vcvt/armv6simd.c.in -D BATCH_TILE=4 -D DATATYPE=QS8 -o src/qs8-vcvt/gen/vcvt-armv6simd-x4.c &
diff --git a/scripts/generate-qs8-vlrelu.sh b/scripts/generate-qs8-vlrelu.sh
index 08e1d00..1867b79 100755
--- a/scripts/generate-qs8-vlrelu.sh
+++ b/scripts/generate-qs8-vlrelu.sh
@@ -62,19 +62,33 @@
tools/xngen src/qs8-vlrelu/armv6simd.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-armv6simd-x8.c &
################################## WAsm SIMD ##################################
-tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-arm-x16.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-arm-x32.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=16 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-arm-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=32 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-arm-x32.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-arm-x16.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-arm-x32.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=16 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-arm-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=32 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-arm-x32.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-x86-x8.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-x86-x16.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-x86-x32.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=16 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=32 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-x86-x8.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-x86-x16.c &
-tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=32 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-x86-x32.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=16 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-arm.c.in -D BATCH_TILE=32 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c &
+
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=8 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-x86-x8.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=16 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-x86-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=32 -D RELAXED=0 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmsimd-x86-x32.c &
+
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=8 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-x86-x8.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=16 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-x86-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=32 -D RELAXED=0 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmsimd-x86-x32.c &
+
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=8 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=16 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=32 -D RELAXED=1 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c &
+
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=8 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=16 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c &
+tools/xngen src/qs8-vlrelu/wasmsimd-x86.c.in -D BATCH_TILE=32 -D RELAXED=1 -D DATATYPE=QU8 -o src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c &
#################################### Scalar ###################################
tools/xngen src/qs8-vlrelu/scalar-select.c.in -D BATCH_TILE=1 -D DATATYPE=QS8 -o src/qs8-vlrelu/gen/vlrelu-scalar-select-x1.c &
diff --git a/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c b/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c
new file mode 100644
index 0000000..a254b5b
--- /dev/null
+++ b/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vcvt/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point);
+ const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point);
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ v128_t vacc0 = wasm_i16x8_load8x8(x);
+ v128_t vacc1 = wasm_i16x8_load8x8(x + 8);
+ x += 16;
+
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+
+ const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1);
+
+ wasm_v128_store(y, vy0);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c b/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c
new file mode 100644
index 0000000..23ea278
--- /dev/null
+++ b/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vcvt/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point);
+ const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point);
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ v128_t vacc0 = wasm_i16x8_load8x8(x);
+ v128_t vacc1 = wasm_i16x8_load8x8(x + 8);
+ v128_t vacc2 = wasm_i16x8_load8x8(x + 16);
+ v128_t vacc3 = wasm_i16x8_load8x8(x + 24);
+ x += 32;
+
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+ vacc2 = wasm_i16x8_sub(vinput_zero_point, vacc2);
+ vacc3 = wasm_i16x8_sub(vinput_zero_point, vacc3);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vacc2 = wasm_i16x8_shl(vacc2, 7);
+ vacc3 = wasm_i16x8_shl(vacc3, 7);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier);
+ vacc2 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc2, vmultiplier);
+ vacc3 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc3, vmultiplier);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+ vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point);
+ vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point);
+
+ const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1);
+ const v128_t vy1 = wasm_i8x16_narrow_i16x8(vacc2, vacc3);
+
+ wasm_v128_store(y, vy0);
+ wasm_v128_store((y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c b/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c
new file mode 100644
index 0000000..e76b491
--- /dev/null
+++ b/src/qs8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c
@@ -0,0 +1,69 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vcvt/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point);
+ const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point);
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vcvt/wasmsimd.c.in b/src/qs8-vcvt/wasmsimd.c.in
index a28010b..49546d0 100644
--- a/src/qs8-vcvt/wasmsimd.c.in
+++ b/src/qs8-vcvt/wasmsimd.c.in
@@ -17,8 +17,10 @@
$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE]
+$WASM_I16X8_Q15MULR = "__builtin_wasm_relaxed_q15mulr_s_i16x8" if RELAXED else "wasm_i16x8_q15mulr_sat"
$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE]
-void xnn_${DATATYPE.lower()}_vcvt_ukernel__wasmsimd_x${BATCH_TILE}(
+$ISA = "wasmrelaxedsimd" if RELAXED else "wasmsimd"
+void xnn_${DATATYPE.lower()}_vcvt_ukernel__${ISA}_x${BATCH_TILE}(
size_t n,
const ${XINT8_T}* x,
${XINT8_T}* y,
@@ -46,7 +48,7 @@
vacc${ABC[N]} = wasm_i16x8_shl(vacc${ABC[N]}, 7);
$for N in range(2*SIMD_TILE):
- vacc${ABC[N]} = wasm_i16x8_q15mulr_sat(vacc${ABC[N]}, vmultiplier);
+ vacc${ABC[N]} = ${WASM_I16X8_Q15MULR}(vacc${ABC[N]}, vmultiplier);
$for N in range(2*SIMD_TILE):
vacc${ABC[N]} = wasm_i16x8_add_sat(vacc${ABC[N]}, voutput_zero_point);
@@ -63,7 +65,7 @@
v128_t vacc = ${WASM_X16X8_LOAD8X8}(x);
vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
vacc = wasm_i16x8_shl(vacc, 7);
- vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
+ vacc = ${WASM_I16X8_Q15MULR}(vacc, vmultiplier);
vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
x += 8;
@@ -78,7 +80,7 @@
v128_t vacc = ${WASM_X16X8_LOAD8X8}(x);
vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
vacc = wasm_i16x8_shl(vacc, 7);
- vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
+ vacc = ${WASM_I16X8_Q15MULR}(vacc, vmultiplier);
vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
v128_t vy = ${WASM_X8X16_NARROW_I16X8}(vacc, vacc);
diff --git a/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c
new file mode 100644
index 0000000..8d1d22a
--- /dev/null
+++ b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c
@@ -0,0 +1,99 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-arm.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.input_zero_point);
+ const v128_t vpositive_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.positive_multiplier);
+ const v128_t vnegative_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.negative_multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.output_zero_point);
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ v128_t vx0 = wasm_v128_load(x);
+ x += 16;
+
+ v128_t vacc0 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_low_i8x16(vx0));
+ v128_t vacc1 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_high_i8x16(vx0));
+ v128_t vmultiplier0 = wasm_i16x8_shr(vacc0, 15);
+ v128_t vmultiplier1 = wasm_i16x8_shr(vacc1, 15);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier0);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier1);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+
+ const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1);
+
+ wasm_v128_store(y, vy0);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ const v128_t vx = wasm_i16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ const v128_t vx = wasm_i16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c
new file mode 100644
index 0000000..20dbbcb
--- /dev/null
+++ b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c
@@ -0,0 +1,114 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-arm.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.input_zero_point);
+ const v128_t vpositive_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.positive_multiplier);
+ const v128_t vnegative_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.negative_multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.output_zero_point);
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ v128_t vx0 = wasm_v128_load(x);
+ v128_t vx1 = wasm_v128_load(x + 16);
+ x += 32;
+
+ v128_t vacc0 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_low_i8x16(vx0));
+ v128_t vacc1 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_high_i8x16(vx0));
+ v128_t vmultiplier0 = wasm_i16x8_shr(vacc0, 15);
+ v128_t vmultiplier1 = wasm_i16x8_shr(vacc1, 15);
+ v128_t vacc2 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_low_i8x16(vx1));
+ v128_t vacc3 = wasm_i16x8_sub(vinput_zero_point, wasm_i16x8_extend_high_i8x16(vx1));
+ v128_t vmultiplier2 = wasm_i16x8_shr(vacc2, 15);
+ v128_t vmultiplier3 = wasm_i16x8_shr(vacc3, 15);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier0);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier1);
+ vacc2 = wasm_i16x8_shl(vacc2, 7);
+ vmultiplier2 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier2);
+ vacc3 = wasm_i16x8_shl(vacc3, 7);
+ vmultiplier3 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier3);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+ vacc2 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc2, vmultiplier2);
+ vacc3 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc3, vmultiplier3);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+ vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point);
+ vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point);
+
+ const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1);
+ const v128_t vy1 = wasm_i8x16_narrow_i16x8(vacc2, vacc3);
+
+ wasm_v128_store(y, vy0);
+ wasm_v128_store((y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ const v128_t vx = wasm_i16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ const v128_t vx = wasm_i16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c
new file mode 100644
index 0000000..de5ff89
--- /dev/null
+++ b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c
@@ -0,0 +1,104 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-x86.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point);
+ const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff);
+ const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point);
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ v128_t vacc0 = wasm_i16x8_load8x8(x);
+ v128_t vacc1 = wasm_i16x8_load8x8(x + 8);
+ x += 16;
+
+ v128_t vmultiplier0 = wasm_i16x8_gt(vacc0, vinput_zero_point);
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ v128_t vmultiplier1 = wasm_i16x8_gt(vacc1, vinput_zero_point);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+
+ vmultiplier0 = wasm_v128_and(vmultiplier0, vmultiplier_diff);
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_xor(vmultiplier0, vmultiplier_base);
+ vmultiplier1 = wasm_v128_and(vmultiplier1, vmultiplier_diff);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_xor(vmultiplier1, vmultiplier_base);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+
+ const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1);
+
+ wasm_v128_store(y, vy0);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c
new file mode 100644
index 0000000..8766842
--- /dev/null
+++ b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-x86.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point);
+ const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff);
+ const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point);
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ v128_t vacc0 = wasm_i16x8_load8x8(x);
+ v128_t vacc1 = wasm_i16x8_load8x8(x + 8);
+ v128_t vacc2 = wasm_i16x8_load8x8(x + 16);
+ v128_t vacc3 = wasm_i16x8_load8x8(x + 24);
+ x += 32;
+
+ v128_t vmultiplier0 = wasm_i16x8_gt(vacc0, vinput_zero_point);
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ v128_t vmultiplier1 = wasm_i16x8_gt(vacc1, vinput_zero_point);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+ v128_t vmultiplier2 = wasm_i16x8_gt(vacc2, vinput_zero_point);
+ vacc2 = wasm_i16x8_sub(vinput_zero_point, vacc2);
+ v128_t vmultiplier3 = wasm_i16x8_gt(vacc3, vinput_zero_point);
+ vacc3 = wasm_i16x8_sub(vinput_zero_point, vacc3);
+
+ vmultiplier0 = wasm_v128_and(vmultiplier0, vmultiplier_diff);
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_xor(vmultiplier0, vmultiplier_base);
+ vmultiplier1 = wasm_v128_and(vmultiplier1, vmultiplier_diff);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_xor(vmultiplier1, vmultiplier_base);
+ vmultiplier2 = wasm_v128_and(vmultiplier2, vmultiplier_diff);
+ vacc2 = wasm_i16x8_shl(vacc2, 7);
+ vmultiplier2 = wasm_v128_xor(vmultiplier2, vmultiplier_base);
+ vmultiplier3 = wasm_v128_and(vmultiplier3, vmultiplier_diff);
+ vacc3 = wasm_i16x8_shl(vacc3, 7);
+ vmultiplier3 = wasm_v128_xor(vmultiplier3, vmultiplier_base);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+ vacc2 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc2, vmultiplier2);
+ vacc3 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc3, vmultiplier3);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+ vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point);
+ vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point);
+
+ const v128_t vy0 = wasm_i8x16_narrow_i16x8(vacc0, vacc1);
+ const v128_t vy1 = wasm_i8x16_narrow_i16x8(vacc2, vacc3);
+
+ wasm_v128_store(y, vy0);
+ wasm_v128_store((y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c
new file mode 100644
index 0000000..97b70e8
--- /dev/null
+++ b/src/qs8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-x86.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8(
+ size_t n,
+ const int8_t* x,
+ int8_t* y,
+ const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(int8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point);
+ const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff);
+ const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point);
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(int8_t));
+ assert(n <= 7 * sizeof(int8_t));
+
+ v128_t vacc = wasm_i16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_i8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(int8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qs8-vlrelu/wasmsimd-arm.c.in b/src/qs8-vlrelu/wasmsimd-arm.c.in
index 74a3a6b..b075385 100644
--- a/src/qs8-vlrelu/wasmsimd-arm.c.in
+++ b/src/qs8-vlrelu/wasmsimd-arm.c.in
@@ -19,8 +19,10 @@
$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE]
$WASM_X16X8_EXTEND_LOW_X8X16 = {"QS8": "wasm_i16x8_extend_low_i8x16", "QU8": "wasm_u16x8_extend_low_u8x16"}[DATATYPE]
$WASM_X16X8_EXTEND_HIGH_X8X16 = {"QS8": "wasm_i16x8_extend_high_i8x16", "QU8": "wasm_u16x8_extend_high_u8x16"}[DATATYPE]
+$WASM_I16X8_Q15MULR = "__builtin_wasm_relaxed_q15mulr_s_i16x8" if RELAXED else "wasm_i16x8_q15mulr_sat"
$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE]
-void xnn_${DATATYPE.lower()}_vlrelu_ukernel__wasmsimd_arm_x${BATCH_TILE}(
+$ISA = "wasmrelaxedsimd" if RELAXED else "wasmsimd"
+void xnn_${DATATYPE.lower()}_vlrelu_ukernel__${ISA}_arm_x${BATCH_TILE}(
size_t n,
const ${XINT8_T}* x,
${XINT8_T}* y,
@@ -52,7 +54,7 @@
vmultiplier${ABC[N]} = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier${ABC[N]});
$for N in range(2*SIMD_TILE):
- vacc${ABC[N]} = wasm_i16x8_q15mulr_sat(vacc${ABC[N]}, vmultiplier${ABC[N]});
+ vacc${ABC[N]} = ${WASM_I16X8_Q15MULR}(vacc${ABC[N]}, vmultiplier${ABC[N]});
$for N in range(2*SIMD_TILE):
vacc${ABC[N]} = wasm_i16x8_add_sat(vacc${ABC[N]}, voutput_zero_point);
@@ -71,7 +73,7 @@
v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
vacc = wasm_i16x8_shl(vacc, 7);
vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
- vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
+ vacc = ${WASM_I16X8_Q15MULR}(vacc, vmultiplier);
vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
x += 8;
@@ -88,7 +90,7 @@
v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
vacc = wasm_i16x8_shl(vacc, 7);
vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
- vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
+ vacc = ${WASM_I16X8_Q15MULR}(vacc, vmultiplier);
vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
v128_t vy = ${WASM_X8X16_NARROW_I16X8}(vacc, vacc);
diff --git a/src/qs8-vlrelu/wasmsimd-x86.c.in b/src/qs8-vlrelu/wasmsimd-x86.c.in
index fdec6ed..3f44c37 100644
--- a/src/qs8-vlrelu/wasmsimd-x86.c.in
+++ b/src/qs8-vlrelu/wasmsimd-x86.c.in
@@ -17,8 +17,10 @@
$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE]
+$WASM_I16X8_Q15MULR = "__builtin_wasm_relaxed_q15mulr_s_i16x8" if RELAXED else "wasm_i16x8_q15mulr_sat"
$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE]
-void xnn_${DATATYPE.lower()}_vlrelu_ukernel__wasmsimd_x86_x${BATCH_TILE}(
+$ISA = "wasmrelaxedsimd" if RELAXED else "wasmsimd"
+void xnn_${DATATYPE.lower()}_vlrelu_ukernel__${ISA}_x86_x${BATCH_TILE}(
size_t n,
const ${XINT8_T}* x,
${XINT8_T}* y,
@@ -50,7 +52,7 @@
vmultiplier${ABC[N]} = wasm_v128_xor(vmultiplier${ABC[N]}, vmultiplier_base);
$for N in range(2*SIMD_TILE):
- vacc${ABC[N]} = wasm_i16x8_q15mulr_sat(vacc${ABC[N]}, vmultiplier${ABC[N]});
+ vacc${ABC[N]} = ${WASM_I16X8_Q15MULR}(vacc${ABC[N]}, vmultiplier${ABC[N]});
$for N in range(2*SIMD_TILE):
vacc${ABC[N]} = wasm_i16x8_add_sat(vacc${ABC[N]}, voutput_zero_point);
@@ -70,7 +72,7 @@
vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
vacc = wasm_i16x8_shl(vacc, 7);
vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
- vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
+ vacc = ${WASM_I16X8_Q15MULR}(vacc, vmultiplier);
vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
x += 8;
@@ -88,7 +90,7 @@
vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
vacc = wasm_i16x8_shl(vacc, 7);
vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
- vacc = wasm_i16x8_q15mulr_sat(vacc, vmultiplier);
+ vacc = ${WASM_I16X8_Q15MULR}(vacc, vmultiplier);
vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
v128_t vy = ${WASM_X8X16_NARROW_I16X8}(vacc, vacc);
diff --git a/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c b/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c
new file mode 100644
index 0000000..12593c5
--- /dev/null
+++ b/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x16.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vcvt/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point);
+ const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point);
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ v128_t vacc0 = wasm_u16x8_load8x8(x);
+ v128_t vacc1 = wasm_u16x8_load8x8(x + 8);
+ x += 16;
+
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+
+ const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1);
+
+ wasm_v128_store(y, vy0);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c b/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c
new file mode 100644
index 0000000..6cdf3be
--- /dev/null
+++ b/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x32.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vcvt/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point);
+ const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point);
+ for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+ v128_t vacc0 = wasm_u16x8_load8x8(x);
+ v128_t vacc1 = wasm_u16x8_load8x8(x + 8);
+ v128_t vacc2 = wasm_u16x8_load8x8(x + 16);
+ v128_t vacc3 = wasm_u16x8_load8x8(x + 24);
+ x += 32;
+
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+ vacc2 = wasm_i16x8_sub(vinput_zero_point, vacc2);
+ vacc3 = wasm_i16x8_sub(vinput_zero_point, vacc3);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vacc2 = wasm_i16x8_shl(vacc2, 7);
+ vacc3 = wasm_i16x8_shl(vacc3, 7);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier);
+ vacc2 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc2, vmultiplier);
+ vacc3 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc3, vmultiplier);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+ vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point);
+ vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point);
+
+ const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1);
+ const v128_t vy1 = wasm_u8x16_narrow_i16x8(vacc2, vacc3);
+
+ wasm_v128_store(y, vy0);
+ wasm_v128_store((y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c b/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c
new file mode 100644
index 0000000..5d51a13
--- /dev/null
+++ b/src/qu8-vcvt/gen/vcvt-wasmrelaxedsimd-x8.c
@@ -0,0 +1,69 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vcvt/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd.input_zero_point);
+ const v128_t vmultiplier = wasm_v128_load64_splat(params->wasmsimd.multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd.output_zero_point);
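+ // Per 8 elements: vacc = (input_zero_point - x) << 7 followed by a Q15
+ // rounding multiply computes scale * (x - input_zero_point), with the sign
+ // flip folded into the Q8 fixed-point multiplier prepared by the init
+ // routine; the saturating add then re-biases by the output zero point
+ // before narrowing back to 8 bits.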
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
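+ // Store the 1..7 remaining bytes with progressively narrower lane stores,
+ // shifting consumed bytes out of the low lanes between steps.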
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c
new file mode 100644
index 0000000..c31e411
--- /dev/null
+++ b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x16.c
@@ -0,0 +1,99 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-arm.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.input_zero_point);
+ const v128_t vpositive_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.positive_multiplier);
+ const v128_t vnegative_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.negative_multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.output_zero_point);
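+ // "arm"-tuned multiplier selection: an arithmetic shift right by 15 turns
+ // the sign of (input_zero_point - x) into an all-ones/all-zeros mask, so
+ // lanes where x exceeds the input zero point pick the positive-slope
+ // multiplier with a single bitselect. This maps well to a NEON BSL when the
+ // WAsm engine lowers it, hence the "arm" suffix.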
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ v128_t vx0 = wasm_v128_load(x);
+ x += 16;
+
+ v128_t vacc0 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_low_u8x16(vx0));
+ v128_t vacc1 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_high_u8x16(vx0));
+ v128_t vmultiplier0 = wasm_i16x8_shr(vacc0, 15);
+ v128_t vmultiplier1 = wasm_i16x8_shr(vacc1, 15);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier0);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier1);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+
+ const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1);
+
+ wasm_v128_store(y, vy0);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ const v128_t vx = wasm_u16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ const v128_t vx = wasm_u16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c
new file mode 100644
index 0000000..dd03ece
--- /dev/null
+++ b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-arm-x32.c
@@ -0,0 +1,114 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-arm.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.input_zero_point);
+ const v128_t vpositive_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.positive_multiplier);
+ const v128_t vnegative_multiplier = wasm_v128_load64_splat(params->wasmsimd_arm.negative_multiplier);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_arm.output_zero_point);
+ for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+ v128_t vx0 = wasm_v128_load(x);
+ v128_t vx1 = wasm_v128_load(x + 16);
+ x += 32;
+
+ v128_t vacc0 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_low_u8x16(vx0));
+ v128_t vacc1 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_high_u8x16(vx0));
+ v128_t vmultiplier0 = wasm_i16x8_shr(vacc0, 15);
+ v128_t vmultiplier1 = wasm_i16x8_shr(vacc1, 15);
+ v128_t vacc2 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_low_u8x16(vx1));
+ v128_t vacc3 = wasm_i16x8_sub(vinput_zero_point, wasm_u16x8_extend_high_u8x16(vx1));
+ v128_t vmultiplier2 = wasm_i16x8_shr(vacc2, 15);
+ v128_t vmultiplier3 = wasm_i16x8_shr(vacc3, 15);
+
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier0);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier1);
+ vacc2 = wasm_i16x8_shl(vacc2, 7);
+ vmultiplier2 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier2);
+ vacc3 = wasm_i16x8_shl(vacc3, 7);
+ vmultiplier3 = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier3);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+ vacc2 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc2, vmultiplier2);
+ vacc3 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc3, vmultiplier3);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+ vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point);
+ vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point);
+
+ const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1);
+ const v128_t vy1 = wasm_u8x16_narrow_i16x8(vacc2, vacc3);
+
+ wasm_v128_store(y, vy0);
+ wasm_v128_store((y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ const v128_t vx = wasm_u16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ const v128_t vx = wasm_u16x8_load8x8(x);
+ v128_t vacc = wasm_i16x8_sub(vinput_zero_point, vx);
+ v128_t vmultiplier = wasm_i16x8_shr(vacc, 15);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_bitselect(vpositive_multiplier, vnegative_multiplier, vmultiplier);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c
new file mode 100644
index 0000000..383753a
--- /dev/null
+++ b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x16.c
@@ -0,0 +1,104 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-x86.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point);
+ const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff);
+ const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point);
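+ // "x86"-tuned multiplier selection: v128.bitselect tends to lower to a
+ // multi-instruction sequence on SSE-class hardware, so the multiplier is
+ // computed branch-free as base ^ (mask & diff) with mask = (x > zero point);
+ // the init routine stores the negative-slope multiplier in multiplier_base
+ // and the XOR of the two multipliers in multiplier_diff.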
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ v128_t vacc0 = wasm_u16x8_load8x8(x);
+ v128_t vacc1 = wasm_u16x8_load8x8(x + 8);
+ x += 16;
+
+ v128_t vmultiplier0 = wasm_i16x8_gt(vacc0, vinput_zero_point);
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ v128_t vmultiplier1 = wasm_i16x8_gt(vacc1, vinput_zero_point);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+
+ vmultiplier0 = wasm_v128_and(vmultiplier0, vmultiplier_diff);
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_xor(vmultiplier0, vmultiplier_base);
+ vmultiplier1 = wasm_v128_and(vmultiplier1, vmultiplier_diff);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_xor(vmultiplier1, vmultiplier_base);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+
+ const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1);
+
+ wasm_v128_store(y, vy0);
+ y += 16;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c
new file mode 100644
index 0000000..2577f25
--- /dev/null
+++ b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x32.c
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-x86.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point);
+ const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff);
+ const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point);
+ for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
+ v128_t vacc0 = wasm_u16x8_load8x8(x);
+ v128_t vacc1 = wasm_u16x8_load8x8(x + 8);
+ v128_t vacc2 = wasm_u16x8_load8x8(x + 16);
+ v128_t vacc3 = wasm_u16x8_load8x8(x + 24);
+ x += 32;
+
+ v128_t vmultiplier0 = wasm_i16x8_gt(vacc0, vinput_zero_point);
+ vacc0 = wasm_i16x8_sub(vinput_zero_point, vacc0);
+ v128_t vmultiplier1 = wasm_i16x8_gt(vacc1, vinput_zero_point);
+ vacc1 = wasm_i16x8_sub(vinput_zero_point, vacc1);
+ v128_t vmultiplier2 = wasm_i16x8_gt(vacc2, vinput_zero_point);
+ vacc2 = wasm_i16x8_sub(vinput_zero_point, vacc2);
+ v128_t vmultiplier3 = wasm_i16x8_gt(vacc3, vinput_zero_point);
+ vacc3 = wasm_i16x8_sub(vinput_zero_point, vacc3);
+
+ vmultiplier0 = wasm_v128_and(vmultiplier0, vmultiplier_diff);
+ vacc0 = wasm_i16x8_shl(vacc0, 7);
+ vmultiplier0 = wasm_v128_xor(vmultiplier0, vmultiplier_base);
+ vmultiplier1 = wasm_v128_and(vmultiplier1, vmultiplier_diff);
+ vacc1 = wasm_i16x8_shl(vacc1, 7);
+ vmultiplier1 = wasm_v128_xor(vmultiplier1, vmultiplier_base);
+ vmultiplier2 = wasm_v128_and(vmultiplier2, vmultiplier_diff);
+ vacc2 = wasm_i16x8_shl(vacc2, 7);
+ vmultiplier2 = wasm_v128_xor(vmultiplier2, vmultiplier_base);
+ vmultiplier3 = wasm_v128_and(vmultiplier3, vmultiplier_diff);
+ vacc3 = wasm_i16x8_shl(vacc3, 7);
+ vmultiplier3 = wasm_v128_xor(vmultiplier3, vmultiplier_base);
+
+ vacc0 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc0, vmultiplier0);
+ vacc1 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc1, vmultiplier1);
+ vacc2 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc2, vmultiplier2);
+ vacc3 = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc3, vmultiplier3);
+
+ vacc0 = wasm_i16x8_add_sat(vacc0, voutput_zero_point);
+ vacc1 = wasm_i16x8_add_sat(vacc1, voutput_zero_point);
+ vacc2 = wasm_i16x8_add_sat(vacc2, voutput_zero_point);
+ vacc3 = wasm_i16x8_add_sat(vacc3, voutput_zero_point);
+
+ const v128_t vy0 = wasm_u8x16_narrow_i16x8(vacc0, vacc1);
+ const v128_t vy1 = wasm_u8x16_narrow_i16x8(vacc2, vacc3);
+
+ wasm_v128_store(y, vy0);
+ wasm_v128_store((y + 16), vy1);
+ y += 32;
+ }
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c
new file mode 100644
index 0000000..cd7b1c8
--- /dev/null
+++ b/src/qu8-vlrelu/gen/vlrelu-wasmrelaxedsimd-x86-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vlrelu/wasmsimd-x86.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vcvt.h>
+
+
+void xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8(
+ size_t n,
+ const uint8_t* x,
+ uint8_t* y,
+ const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(n != 0);
+ assert(n % sizeof(uint8_t) == 0);
+ assert(x != NULL);
+ assert(y != NULL);
+
+ const v128_t vinput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.input_zero_point);
+ const v128_t vmultiplier_diff = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_diff);
+ const v128_t vmultiplier_base = wasm_v128_load64_splat(params->wasmsimd_x86.multiplier_base);
+ const v128_t voutput_zero_point = wasm_v128_load64_splat(params->wasmsimd_x86.output_zero_point);
+ for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+ x += 8;
+
+ const v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ wasm_v128_store64_lane(y, vy, 0);
+ y += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ assert(n >= 1 * sizeof(uint8_t));
+ assert(n <= 7 * sizeof(uint8_t));
+
+ v128_t vacc = wasm_u16x8_load8x8(x);
+ v128_t vmultiplier = wasm_i16x8_gt(vacc, vinput_zero_point);
+ vacc = wasm_i16x8_sub(vinput_zero_point, vacc);
+ vmultiplier = wasm_v128_and(vmultiplier, vmultiplier_diff);
+ vacc = wasm_i16x8_shl(vacc, 7);
+ vmultiplier = wasm_v128_xor(vmultiplier, vmultiplier_base);
+ vacc = __builtin_wasm_relaxed_q15mulr_s_i16x8(vacc, vmultiplier);
+ vacc = wasm_i16x8_add_sat(vacc, voutput_zero_point);
+
+ v128_t vy = wasm_u8x16_narrow_i16x8(vacc, vacc);
+ if (n & (4 * sizeof(uint8_t))) {
+ wasm_v128_store32_lane(y, vy, 0);
+ vy = wasm_u64x2_shr(vy, 32);
+ y += 4;
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ wasm_v128_store16_lane(y, vy, 0);
+ vy = wasm_u32x4_shr(vy, 16);
+ y += 2;
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ wasm_v128_store8_lane(y, vy, 0);
+ }
+ }
+}
diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h
index 3ca9d53..2caf242 100644
--- a/src/xnnpack/vcvt.h
+++ b/src/xnnpack/vcvt.h
@@ -314,6 +314,10 @@
DECLARE_QS8_VCVT_UKERNEL_FUNCTION(xnn_qs8_vcvt_ukernel__wasmsimd_x16)
DECLARE_QS8_VCVT_UKERNEL_FUNCTION(xnn_qs8_vcvt_ukernel__wasmsimd_x32)
+DECLARE_QS8_VCVT_UKERNEL_FUNCTION(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8)
+DECLARE_QS8_VCVT_UKERNEL_FUNCTION(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16)
+DECLARE_QS8_VCVT_UKERNEL_FUNCTION(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32)
+
DECLARE_QS8_VCVT_UKERNEL_FUNCTION(xnn_qs8_vcvt_ukernel__armv6simd_x4)
DECLARE_QS8_VCVT_UKERNEL_FUNCTION(xnn_qs8_vcvt_ukernel__armv6simd_x8)
@@ -403,6 +407,10 @@
DECLARE_QU8_VCVT_UKERNEL_FUNCTION(xnn_qu8_vcvt_ukernel__wasmsimd_x16)
DECLARE_QU8_VCVT_UKERNEL_FUNCTION(xnn_qu8_vcvt_ukernel__wasmsimd_x32)
+DECLARE_QU8_VCVT_UKERNEL_FUNCTION(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8)
+DECLARE_QU8_VCVT_UKERNEL_FUNCTION(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16)
+DECLARE_QU8_VCVT_UKERNEL_FUNCTION(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32)
+
DECLARE_QU8_VCVT_UKERNEL_FUNCTION(xnn_qu8_vcvt_ukernel__armv6simd_x4)
DECLARE_QU8_VCVT_UKERNEL_FUNCTION(xnn_qu8_vcvt_ukernel__armv6simd_x8)
diff --git a/src/xnnpack/vlrelu.h b/src/xnnpack/vlrelu.h
index 4c83e8e..3503a0a 100644
--- a/src/xnnpack/vlrelu.h
+++ b/src/xnnpack/vlrelu.h
@@ -52,6 +52,13 @@
DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__wasmsimd_x86_x16)
DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__wasmsimd_x86_x32)
+DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16)
+DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32)
+
+DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8)
+DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16)
+DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32)
+
DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__armv6simd_x4)
DECLARE_QS8_VLRELU_UKERNEL_FUNCTION(xnn_qs8_vlrelu_ukernel__armv6simd_x8)
@@ -100,6 +107,13 @@
DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16)
DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32)
+DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16)
+DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32)
+
+DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8)
+DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16)
+DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32)
+
DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__armv6simd_x4)
DECLARE_QU8_VLRELU_UKERNEL_FUNCTION(xnn_qu8_vlrelu_ukernel__armv6simd_x8)
diff --git a/test/qs8-vcvt.cc b/test/qs8-vcvt.cc
index 66cac24..2c112be 100644
--- a/test/qs8-vcvt.cc
+++ b/test/qs8-vcvt.cc
@@ -1611,6 +1611,240 @@
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X8, batch_eq_8) {
+ VCvtMicrokernelTester()
+ .batch_size(8)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X8, batch_div_8) {
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X8, batch_lt_8) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X8, batch_gt_8) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X8, scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X8, input_zero_point) {
+ for (int16_t input_zero_point = 0; input_zero_point < 5; input_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X8, output_zero_point) {
+ for (int16_t output_zero_point = 0; output_zero_point < 5; output_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X16, batch_eq_16) {
+ VCvtMicrokernelTester()
+ .batch_size(16)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X16, scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X16, input_zero_point) {
+ for (int16_t input_zero_point = 0; input_zero_point < 5; input_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X16, output_zero_point) {
+ for (int16_t output_zero_point = 0; output_zero_point < 5; output_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X32, batch_eq_32) {
+ VCvtMicrokernelTester()
+ .batch_size(32)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X32, scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X32, input_zero_point) {
+ for (int16_t input_zero_point = 0; input_zero_point < 5; input_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+ }
+
+ TEST(QS8_VCVT__WASMRELAXEDSIMD_X32, output_zero_point) {
+ for (int16_t output_zero_point = 0; output_zero_point < 5; output_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .Test(xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qs8_cvt_wasmsimd_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
#if XNN_ARCH_ARM
TEST(QS8_VCVT__ARMV6SIMD_X4, batch_eq_4) {
TEST_REQUIRES_ARM_V6_SIMD;
diff --git a/test/qs8-vcvt.yaml b/test/qs8-vcvt.yaml
index a99b92d..3eafa53 100644
--- a/test/qs8-vcvt.yaml
+++ b/test/qs8-vcvt.yaml
@@ -53,6 +53,14 @@
- name: xnn_qs8_vcvt_ukernel__wasmsimd_x32
init: xnn_init_qs8_cvt_wasmsimd_params
+# WAsm Relaxed SIMD
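+# (the relaxed kernels reuse the WAsm SIMD parameter initializers, since they
+# consume the same params layout)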
+- name: xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x8
+ init: xnn_init_qs8_cvt_wasmsimd_params
+- name: xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x16
+ init: xnn_init_qs8_cvt_wasmsimd_params
+- name: xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32
+ init: xnn_init_qs8_cvt_wasmsimd_params
+
# ARMv6 SIMD
- name: xnn_qs8_vcvt_ukernel__armv6simd_x4
init: xnn_init_qs8_cvt_armv6simd_params
diff --git a/test/qs8-vlrelu.cc b/test/qs8-vlrelu.cc
index 728f77e..7bf5c66 100644
--- a/test/qs8-vlrelu.cc
+++ b/test/qs8-vlrelu.cc
@@ -1762,6 +1762,391 @@
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_eq_16) {
+ VLReLUMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X16, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_eq_32) {
+ VLReLUMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_ARM_X32, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qs8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_eq_8) {
+ VLReLUMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_div_8) {
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_lt_8) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_gt_8) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X8, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_eq_16) {
+ VLReLUMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X16, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_eq_32) {
+ VLReLUMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QS8_VLRELU__WASMRELAXEDSIMD_X86_X32, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qs8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
#if XNN_ARCH_ARM
TEST(QS8_VLRELU__ARMV6SIMD_X4, batch_eq_4) {
TEST_REQUIRES_ARM_V6_SIMD;
diff --git a/test/qs8-vlrelu.yaml b/test/qs8-vlrelu.yaml
index 230dc57..52707bd 100644
--- a/test/qs8-vlrelu.yaml
+++ b/test/qs8-vlrelu.yaml
@@ -61,6 +61,20 @@
- name: xnn_qs8_vlrelu_ukernel__wasmsimd_x86_x32
init: xnn_init_qs8_lrelu_wasmsimd_x86_params
+# WAsm Relaxed SIMD (ARM-optimized)
+- name: xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16
+ init: xnn_init_qs8_lrelu_wasmsimd_arm_params
+- name: xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32
+ init: xnn_init_qs8_lrelu_wasmsimd_arm_params
+
+# WAsm Relaxed SIMD (x86-optimized)
+- name: xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8
+ init: xnn_init_qs8_lrelu_wasmsimd_x86_params
+- name: xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16
+ init: xnn_init_qs8_lrelu_wasmsimd_x86_params
+- name: xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32
+ init: xnn_init_qs8_lrelu_wasmsimd_x86_params
+
# ARMv6 SIMD
- name: xnn_qs8_vlrelu_ukernel__armv6simd_x4
init: xnn_init_qs8_lrelu_armv6simd_params
diff --git a/test/qu8-vcvt.cc b/test/qu8-vcvt.cc
index 720ef8f..e0ce65a 100644
--- a/test/qu8-vcvt.cc
+++ b/test/qu8-vcvt.cc
@@ -1630,6 +1630,243 @@
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X8, batch_eq_8) {
+ VCvtMicrokernelTester()
+ .batch_size(8)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X8, batch_div_8) {
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X8, batch_lt_8) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X8, batch_gt_8) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X8, scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .output_zero_point(100)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X8, input_zero_point) {
+ for (int16_t input_zero_point = 0; input_zero_point < 5; input_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X8, output_zero_point) {
+ for (int16_t output_zero_point = 0; output_zero_point < 5; output_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X16, batch_eq_16) {
+ VCvtMicrokernelTester()
+ .batch_size(16)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X16, scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .output_zero_point(100)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X16, input_zero_point) {
+ for (int16_t input_zero_point = 0; input_zero_point < 5; input_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X16, output_zero_point) {
+ for (int16_t output_zero_point = 0; output_zero_point < 5; output_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X32, batch_eq_32) {
+ VCvtMicrokernelTester()
+ .batch_size(32)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X32, scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .scale(50)
+ .output_zero_point(100)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X32, input_zero_point) {
+ for (int16_t input_zero_point = 0; input_zero_point < 5; input_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+ }
+
+ TEST(QU8_VCVT__WASMRELAXEDSIMD_X32, output_zero_point) {
+ for (int16_t output_zero_point = 0; output_zero_point < 5; output_zero_point += 2) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VCvtMicrokernelTester()
+ .batch_size(batch_size)
+ .output_zero_point(output_zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .Test(xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32, xnn_init_qu8_cvt_wasmsimd_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
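The batch_eq_32 / batch_div_32 / batch_lt_32 / batch_gt_32 cases straddle the kernel's 32-element tile so that the full-vector main loop, the partial tail, and the hand-off between them are all exercised. A sketch of the loop shape being probed, assuming a hypothetical 32-byte tile; the real kernel vectorizes the inner work with WAsm SIMD:

    #include <cstddef>
    #include <cstdint>

    // batch_eq_32 runs exactly one full tile, batch_div_32 several,
    // batch_lt_32 goes straight to the tail, and batch_gt_32 runs one
    // full tile plus a partial remainder.
    void convert_tiled(size_t batch, const uint8_t* input, uint8_t* output,
                       uint8_t (*convert_one)(uint8_t)) {
      constexpr size_t kTile = 32;
      for (; batch >= kTile; batch -= kTile) {
        for (size_t i = 0; i < kTile; i++) {  // one SIMD tile in the real kernel
          output[i] = convert_one(input[i]);
        }
        input += kTile;
        output += kTile;
      }
      for (size_t i = 0; i < batch; i++) {  // remainder: batch % 32 elements
        output[i] = convert_one(input[i]);
      }
    }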
+
+
#if XNN_ARCH_ARM
TEST(QU8_VCVT__ARMV6SIMD_X4, batch_eq_4) {
TEST_REQUIRES_ARM_V6_SIMD;
diff --git a/test/qu8-vcvt.yaml b/test/qu8-vcvt.yaml
index 9935a95..06a2c30 100644
--- a/test/qu8-vcvt.yaml
+++ b/test/qu8-vcvt.yaml
@@ -53,6 +53,14 @@
- name: xnn_qu8_vcvt_ukernel__wasmsimd_x32
init: xnn_init_qu8_cvt_wasmsimd_params
+# WAsm Relaxed SIMD
+- name: xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x8
+ init: xnn_init_qu8_cvt_wasmsimd_params
+- name: xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x16
+ init: xnn_init_qu8_cvt_wasmsimd_params
+- name: xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32
+ init: xnn_init_qu8_cvt_wasmsimd_params
+
# ARMv6 SIMD
- name: xnn_qu8_vcvt_ukernel__armv6simd_x4
init: xnn_init_qu8_cvt_armv6simd_params
diff --git a/test/qu8-vlrelu.cc b/test/qu8-vlrelu.cc
index 495f7e1..413e73d 100644
--- a/test/qu8-vlrelu.cc
+++ b/test/qu8-vlrelu.cc
@@ -1601,6 +1601,916 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, batch_eq_16) {
+ VLReLUMicrokernelTester()
+ .batch_size(16)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X16, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, batch_eq_32) {
+ VLReLUMicrokernelTester()
+ .batch_size(32)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_ARM_X32, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, batch_eq_8) {
+ VLReLUMicrokernelTester()
+ .batch_size(8)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, batch_div_8) {
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, batch_lt_8) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, batch_gt_8) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X8, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, batch_eq_16) {
+ VLReLUMicrokernelTester()
+ .batch_size(16)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X16, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, batch_eq_32) {
+ VLReLUMicrokernelTester()
+ .batch_size(32)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMSIMD_X86_X32, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
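In the VLRELU cases above, positive_scale applies to elements at or above the input zero point and negative_scale to those below it, because the input zero point is where the real-valued signal crosses zero; the negative_scale lists deliberately include positive values so slopes of either sign are covered. A minimal scalar sketch of that reference behavior, assuming the same rounding and saturation as the convert kernels (illustrative only):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical quantized leaky ReLU reference: the sign test is taken
    // against the input zero point rather than against zero.
    uint8_t qu8_vlrelu_reference(uint8_t x, int16_t input_zero_point,
                                 float positive_scale, float negative_scale,
                                 int16_t output_zero_point) {
      const int32_t centered = static_cast<int32_t>(x) - input_zero_point;
      const float scale = centered >= 0 ? positive_scale : negative_scale;
      const long y = std::lrintf(static_cast<float>(centered) * scale)
                   + output_zero_point;
      return static_cast<uint8_t>(std::min<long>(std::max<long>(y, 0L), 255L));
    }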
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_eq_16) {
+ VLReLUMicrokernelTester()
+ .batch_size(16)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X16, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_eq_32) {
+ VLReLUMicrokernelTester()
+ .batch_size(32)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_ARM_X32, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32, xnn_init_qu8_lrelu_wasmsimd_arm_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_eq_8) {
+ VLReLUMicrokernelTester()
+ .batch_size(8)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_div_8) {
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_lt_8) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, batch_gt_8) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X8, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_eq_16) {
+ VLReLUMicrokernelTester()
+ .batch_size(16)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_div_16) {
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_lt_16) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, batch_gt_16) {
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X16, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
+#if XNN_ARCH_WASMRELAXEDSIMD
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_eq_32) {
+ VLReLUMicrokernelTester()
+ .batch_size(32)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_div_32) {
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_lt_32) {
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, batch_gt_32) {
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, positive_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float positive_scale : std::vector<float>({1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .positive_scale(positive_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, negative_scale) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float negative_scale : std::vector<float>({-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1.0f / 256.0f, 0.3f, 1.3f, 128.0f})) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .negative_scale(negative_scale)
+ .input_zero_point(150)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, input_zero_point) {
+ for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(input_zero_point)
+ .output_zero_point(100)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+
+ TEST(QU8_VLRELU__WASMRELAXEDSIMD_X86_X32, output_zero_point) {
+ for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) {
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VLReLUMicrokernelTester()
+ .batch_size(batch_size)
+ .input_zero_point(150)
+ .output_zero_point(output_zero_point)
+ .Test(xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32, xnn_init_qu8_lrelu_wasmsimd_x86_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_WASMRELAXEDSIMD
+
+
#if XNN_ARCH_ARM
TEST(QU8_VLRELU__ARMV6SIMD_X4, batch_eq_4) {
TEST_REQUIRES_ARM_V6_SIMD;
diff --git a/test/qu8-vlrelu.yaml b/test/qu8-vlrelu.yaml
index d8323b8..7951edb 100644
--- a/test/qu8-vlrelu.yaml
+++ b/test/qu8-vlrelu.yaml
@@ -47,6 +47,34 @@
- name: xnn_qu8_vlrelu_ukernel__avx2_x64
init: xnn_init_qu8_lrelu_avx2_params
+# WAsm SIMD (ARM-optimized)
+- name: xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x16
+ init: xnn_init_qu8_lrelu_wasmsimd_arm_params
+- name: xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32
+ init: xnn_init_qu8_lrelu_wasmsimd_arm_params
+
+# WAsm SIMD (x86-optimized)
+- name: xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x8
+ init: xnn_init_qu8_lrelu_wasmsimd_x86_params
+- name: xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16
+ init: xnn_init_qu8_lrelu_wasmsimd_x86_params
+- name: xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x32
+ init: xnn_init_qu8_lrelu_wasmsimd_x86_params
+
+# WAsm Relaxed SIMD (ARM-optimized)
+- name: xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x16
+ init: xnn_init_qu8_lrelu_wasmsimd_arm_params
+- name: xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32
+ init: xnn_init_qu8_lrelu_wasmsimd_arm_params
+
+# WAsm Relaxed SIMD (x86-optimized)
+- name: xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x8
+ init: xnn_init_qu8_lrelu_wasmsimd_x86_params
+- name: xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x16
+ init: xnn_init_qu8_lrelu_wasmsimd_x86_params
+- name: xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32
+ init: xnn_init_qu8_lrelu_wasmsimd_x86_params
+
# ARMv6 SIMD
- name: xnn_qu8_vlrelu_ukernel__armv6simd_x4
init: xnn_init_qu8_lrelu_armv6simd_params
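Each name/init pair in the yaml above feeds the script that regenerates the repetitive TEST bodies earlier in this diff, so wiring up a new microkernel is a one-entry yaml change. A table-driven sketch of that pairing; the param struct and function signatures here are hypothetical stand-ins for XNNPACK's real types:

    #include <cstddef>
    #include <cstdint>

    // Stand-ins for the library's param union and function signatures.
    struct LReluParams { float positive_scale, negative_scale; };
    using InitFn   = void (*)(LReluParams* params, float positive_scale,
                              float negative_scale);
    using KernelFn = void (*)(size_t batch, const uint8_t* input,
                              uint8_t* output, const LReluParams* params);

    // One row per yaml entry; the generator expands each row into the full
    // batch/scale/zero-point matrix seen in the test file.
    struct KernelSpec {
      const char* name;
      KernelFn kernel;
      InitFn init;
    };

    // Conceptually, a generated test fills the params via the init function,
    // runs the kernel, and compares each output byte against a scalar reference.
    bool run_case(const KernelSpec& spec, size_t batch, const uint8_t* input,
                  uint8_t* actual, const uint8_t* expected) {
      LReluParams params;
      spec.init(&params, /*positive_scale=*/1.3f, /*negative_scale=*/-0.3f);
      spec.kernel(batch, input, actual, &params);
      for (size_t i = 0; i < batch; i++) {
        if (actual[i] != expected[i]) return false;
      }
      return true;
    }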