F16 VSIGMOID microkernels for NEON+FP16ARITH
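
These kernels evaluate sigmoid(x) = 1 / (1 + exp(-x)) with a one-step range
reduction (rr1) and a degree-3 polynomial (p3): for z = |x| they approximate
e = exp(-z) as 2**n * exp(-t), where n = round(-z * log2(e)) and
t = z + n * ln(2), then form sigmoid(-z) = e / (e + 1) and use the identity
sigmoid(x) = 1 - sigmoid(-x) for non-negative inputs. A minimal scalar sketch
of the scheme, written in float for clarity (the c2/c3 values below are
illustrative placeholders, not the tuned fp16 coefficients stored in
xnn_f16_sigmoid_params):

  #include <math.h>

  static float sigmoid_rr1_p3_ref(float x) {
    const float c2 = 0.5f;       // placeholder coefficients, roughly the
    const float c3 = -0.16667f;  // Taylor values 1/2 and -1/6
    const float z = fabsf(x);
    const float n = roundf(z * -1.4426950f);  // n = round(-z * log2(e))
    const float s = ldexpf(1.0f, (int) n);    // s = 2**n
    const float t = z + n * 0.6931472f;       // residual, |t| <= ln(2) / 2
    const float p = 1.0f - t * (c2 + c3 * t);
    const float e = s * (1.0f - p * t);       // e ~ exp(-z)
    const float f = e / (e + 1.0f);           // sigmoid(-z)
    return signbit(x) ? f : 1.0f - f;
  }

The NEON kernels perform the same steps on fp16 lanes, replacing
roundf/ldexpf with the magic-bias trick and flushing results to zero once
|x| exceeds a denormal cutoff. Two variants are generated per batch tile:
"div" computes e / (e + 1) with a hardware divide, while "recpe" substitutes
a reciprocal estimate refined by a Newton-Raphson step.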
PiperOrigin-RevId: 429159816
diff --git a/BUILD.bazel b/BUILD.bazel
index ef6d13b..c9f7333 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4549,6 +4549,22 @@
"src/f16-vlrelu/gen/vlrelu-neonfp16arith-x16.c",
"src/f16-vmulcaddc/gen/c8-minmax-neonfp16arith-2x.c",
"src/f16-vmulcaddc/gen/c16-minmax-neonfp16arith-2x.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x8.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x16.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x24.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x32.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x40.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x48.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x56.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x64.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x8.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x16.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x24.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x32.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x40.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x48.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x56.c",
+ "src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x64.c",
"src/math/exp-f16-neonfp16arith-rr2-p3.c",
"src/math/expminus-f16-neonfp16arith-rr2-p3.c",
"src/math/sigmoid-f16-neonfp16arith-rr1-p3-div.c",
@@ -10011,6 +10027,15 @@
)
xnnpack_benchmark(
+ name = "f16_vsigmoid_bench",
+ srcs = [
+ "bench/f16-vsigmoid.cc",
+ "src/xnnpack/AlignedAllocator.h",
+ ] + MICROKERNEL_BENCHMARK_HDRS,
+ deps = MICROKERNEL_BENCHMARK_DEPS,
+)
+
+xnnpack_benchmark(
name = "f16_f32_vcvt_bench",
srcs = [
"bench/f16-f32-vcvt.cc",
@@ -11294,6 +11319,15 @@
)
xnnpack_unit_test(
+ name = "f16_vsigmoid_test",
+ srcs = [
+ "test/f16-vsigmoid.cc",
+ "test/vunary-microkernel-tester.h",
+ ] + MICROKERNEL_TEST_HDRS,
+ deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
name = "f16_vsub_minmax_test",
srcs = [
"test/f16-vsub-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 732ecc6..5ee04c3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3302,6 +3302,22 @@
src/f16-vlrelu/gen/vlrelu-neonfp16arith-x16.c
src/f16-vmulcaddc/gen/c8-minmax-neonfp16arith-2x.c
src/f16-vmulcaddc/gen/c16-minmax-neonfp16arith-2x.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x8.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x16.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x24.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x32.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x40.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x48.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x56.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x64.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x8.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x16.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x24.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x32.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x40.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x48.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x56.c
+ src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x64.c
src/math/exp-f16-neonfp16arith-rr2-p3.c
src/math/expminus-f16-neonfp16arith-rr2-p3.c
src/math/sigmoid-f16-neonfp16arith-rr1-p3-div.c
@@ -7299,6 +7315,15 @@
TARGET_LINK_LIBRARIES(f16-prelu-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
ADD_TEST(f16-prelu-test f16-prelu-test)
+ ADD_EXECUTABLE(f16-vsigmoid-test test/f16-vsigmoid.cc $<TARGET_OBJECTS:all_microkernels>)
+ SET_TARGET_PROPERTIES(f16-vsigmoid-test PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(f16-vsigmoid-test PRIVATE include src test)
+ TARGET_LINK_LIBRARIES(f16-vsigmoid-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
+ ADD_TEST(f16-vsigmoid-test f16-vsigmoid-test)
+
ADD_EXECUTABLE(f16-vsub-minmax-test test/f16-vsub-minmax.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(f16-vsub-minmax-test PROPERTIES
CXX_STANDARD 11
@@ -9046,6 +9071,14 @@
TARGET_INCLUDE_DIRECTORIES(f16-igemm-bench PRIVATE . include src)
TARGET_LINK_LIBRARIES(f16-igemm-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
+ ADD_EXECUTABLE(f16-vsigmoid-bench bench/f16-vsigmoid.cc $<TARGET_OBJECTS:all_microkernels>)
+ SET_TARGET_PROPERTIES(f16-vsigmoid-bench PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(f16-vsigmoid-bench PRIVATE . include src)
+ TARGET_LINK_LIBRARIES(f16-vsigmoid-bench PRIVATE benchmark bench-utils fp16 pthreadpool)
+
ADD_EXECUTABLE(f16-f32-vcvt-bench bench/f16-f32-vcvt.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(f16-f32-vcvt-bench PROPERTIES
CXX_STANDARD 11
diff --git a/bench/f16-vsigmoid.cc b/bench/f16-vsigmoid.cc
new file mode 100644
index 0000000..99ed596
--- /dev/null
+++ b/bench/f16-vsigmoid.cc
@@ -0,0 +1,166 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include <fp16/fp16.h>
+#include "bench/utils.h"
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/params.h>
+#include <xnnpack/vunary.h>
+
+
+static void f16_vsigmoid(
+ benchmark::State& state,
+ xnn_f16_vsigmoid_ukernel_function sigmoid,
+ xnn_init_f16_sigmoid_params_fn init_params,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
+
+ const size_t num_elements = state.range(0);
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+
+ xnn_f16_sigmoid_params params;
+ init_params(&params);
+ for (auto _ : state) {
+ sigmoid(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+ }
+
+ const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+ if (cpu_frequency != 0) {
+ state.counters["cpufreq"] = cpu_frequency;
+ }
+
+ const size_t elements_per_iteration = num_elements;
+ state.counters["elements"] =
+ benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+ const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
+ state.counters["bytes"] =
+ benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+}
+
+#if XNN_ARCH_ARM64
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x8,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x16,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x24,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x32,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x40,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x48,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x56,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_div_x64,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x8,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x16,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x24,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x32,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x40,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x48,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x56,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f16_vsigmoid, neonfp16arith_rr1_p3_recpe_x64,
+ xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64,
+ xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params,
+ benchmark::utils::CheckNEONFP16ARITH)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
+ ->UseRealTime();
+#endif // XNN_ARCH_ARM64
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
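
(Usage note: building the f16_vsigmoid_bench target declared above, for
example "bazel build -c opt //:f16_vsigmoid_bench" with a suitable Arm64
toolchain configuration, produces a binary that reports per-variant
throughput via the "elements" and "bytes" rate counters.)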
diff --git a/scripts/generate-f16-vsigmoid.sh b/scripts/generate-f16-vsigmoid.sh
new file mode 100755
index 0000000..b5c5805
--- /dev/null
+++ b/scripts/generate-f16-vsigmoid.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################### ARM NEON ##################################
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=8 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x8.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=16 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x16.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=24 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x24.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=32 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x32.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=40 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x40.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=48 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x48.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=56 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x56.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=64 -D DIV_ALGO=div -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x64.c &
+
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=8 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x8.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=16 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x16.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=24 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x24.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=32 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x32.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=40 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x40.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=48 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x48.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=56 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x56.c &
+tools/xngen src/f16-vsigmoid/neonfp16arith.c.in -D BATCH_TILE=64 -D DIV_ALGO=recpe -o src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x64.c &
+
+################################## Unit tests #################################
+tools/generate-vunary-test.py --spec test/f16-vsigmoid.yaml --output test/f16-vsigmoid.cc &
+
+wait
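
The DIV_ALGO=div kernels below finish with vdivq_f16; the DIV_ALGO=recpe
sources are generated from the same template but are not reproduced in this
excerpt. As a sketch, assuming they follow the standard NEON
estimate-plus-refinement pattern (vd and ve named as in the div kernels
below), the division would be replaced by roughly:

  // Reciprocal path: ~8-bit initial estimate, one Newton-Raphson step.
  float16x8_t vr = vrecpeq_f16(vd);         // vr ~ 1/d
  vr = vmulq_f16(vr, vrecpsq_f16(vr, vd));  // refine: vr *= (2 - vr*d)
  float16x8_t vf = vmulq_f16(ve, vr);       // f = e * (1/d) ~ e / (e + 1)

This trades a small accuracy loss for avoiding the long-latency divide;
consult the generated recpe sources for the exact sequence.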
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x16.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x16.c
new file mode 100644
index 0000000..c27c41f
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x16.c
@@ -0,0 +1,144 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 16 * sizeof(__fp16); batch -= 16 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+
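+ // n = round(z * -log2(e)), computed with the magic-bias trick: adding a
+ // large bias forces rounding and leaves the integer in the low mantissa bits.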
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+
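+ // s = 2**n, formed by shifting the integer bits of n into the fp16 exponent.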
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+
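+ // Degree-3 polynomial: p = 1 - t*(c2 + c3*t), so that s*(1 - p*t) ~ exp(-z).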
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+
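+ // d = e + 1; f = e / d = sigmoid(-z).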
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+
+ float16x8_t vf0 = vdivq_f16(ve0, vd0);
+ float16x8_t vf1 = vdivq_f16(ve1, vd1);
+
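+ // Flush f to zero where |x| exceeds the cutoff (exp(-|x|) underflows there).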
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+
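+ // For x >= 0, apply the identity sigmoid(x) = 1 - sigmoid(-x).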
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
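+ // 1-7 trailing elements: XNN_OOB_READS permits the full 8-element load;
+ // only the valid lanes are stored below.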
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x24.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x24.c
new file mode 100644
index 0000000..c46759a
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x24.c
@@ -0,0 +1,160 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 24 * sizeof(__fp16); batch -= 24 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+
+ float16x8_t vf0 = vdivq_f16(ve0, vd0);
+ float16x8_t vf1 = vdivq_f16(ve1, vd1);
+ float16x8_t vf2 = vdivq_f16(ve2, vd2);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x32.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x32.c
new file mode 100644
index 0000000..096b9fe
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x32.c
@@ -0,0 +1,176 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 32 * sizeof(__fp16); batch -= 32 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+
+ float16x8_t vf0 = vdivq_f16(ve0, vd0);
+ float16x8_t vf1 = vdivq_f16(ve1, vd1);
+ float16x8_t vf2 = vdivq_f16(ve2, vd2);
+ float16x8_t vf3 = vdivq_f16(ve3, vd3);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x40.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x40.c
new file mode 100644
index 0000000..745ef30
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x40.c
@@ -0,0 +1,192 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 40 * sizeof(__fp16); batch -= 40 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+
+ float16x8_t vf0 = vdivq_f16(ve0, vd0);
+ float16x8_t vf1 = vdivq_f16(ve1, vd1);
+ float16x8_t vf2 = vdivq_f16(ve2, vd2);
+ float16x8_t vf3 = vdivq_f16(ve3, vd3);
+ float16x8_t vf4 = vdivq_f16(ve4, vd4);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x48.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x48.c
new file mode 100644
index 0000000..f26d053
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x48.c
@@ -0,0 +1,208 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 48 * sizeof(__fp16); batch -= 48 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+ const float16x8_t vx5 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+ const float16x8_t vz5 = vabsq_f16(vx5);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+ float16x8_t vn5 = vfmaq_f16(vmagic_bias, vz5, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+ const float16x8_t vs5 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn5), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+ vn5 = vsubq_f16(vn5, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+ float16x8_t vt5 = vfmaq_f16(vz5, vn5, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+ float16x8_t vp5 = vfmaq_f16(vc2, vc3, vt5);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+ vp5 = vfmsq_f16(vone, vp5, vt5);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+ vt5 = vmulq_f16(vt5, vs5);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+ const float16x8_t ve5 = vfmsq_f16(vs5, vp5, vt5);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+ const float16x8_t vd5 = vaddq_f16(ve5, vone);
+
+ float16x8_t vf0 = vdivq_f16(ve0, vd0);
+ float16x8_t vf1 = vdivq_f16(ve1, vd1);
+ float16x8_t vf2 = vdivq_f16(ve2, vd2);
+ float16x8_t vf3 = vdivq_f16(ve3, vd3);
+ float16x8_t vf4 = vdivq_f16(ve4, vd4);
+ float16x8_t vf5 = vdivq_f16(ve5, vd5);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+ vf5 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf5), vcagtq_f16(vx5, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+ const uint16x8_t vm5 = vcltq_f16(vx5, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+ vf5 = vbslq_f16(vm5, vf5, vsubq_f16(vone, vf5));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ vst1q_f16(o, vf5); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x56.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x56.c
new file mode 100644
index 0000000..743c87a
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x56.c
@@ -0,0 +1,224 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 56 * sizeof(__fp16); batch -= 56 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+ const float16x8_t vx5 = vld1q_f16(i); i += 8;
+ const float16x8_t vx6 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+ const float16x8_t vz5 = vabsq_f16(vx5);
+ const float16x8_t vz6 = vabsq_f16(vx6);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+ float16x8_t vn5 = vfmaq_f16(vmagic_bias, vz5, vminus_log2e);
+ float16x8_t vn6 = vfmaq_f16(vmagic_bias, vz6, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+ const float16x8_t vs5 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn5), 10));
+ const float16x8_t vs6 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn6), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+ vn5 = vsubq_f16(vn5, vmagic_bias);
+ vn6 = vsubq_f16(vn6, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+ float16x8_t vt5 = vfmaq_f16(vz5, vn5, vln2);
+ float16x8_t vt6 = vfmaq_f16(vz6, vn6, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+ float16x8_t vp5 = vfmaq_f16(vc2, vc3, vt5);
+ float16x8_t vp6 = vfmaq_f16(vc2, vc3, vt6);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+ vp5 = vfmsq_f16(vone, vp5, vt5);
+ vp6 = vfmsq_f16(vone, vp6, vt6);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+ vt5 = vmulq_f16(vt5, vs5);
+ vt6 = vmulq_f16(vt6, vs6);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+ const float16x8_t ve5 = vfmsq_f16(vs5, vp5, vt5);
+ const float16x8_t ve6 = vfmsq_f16(vs6, vp6, vt6);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+ const float16x8_t vd5 = vaddq_f16(ve5, vone);
+ const float16x8_t vd6 = vaddq_f16(ve6, vone);
+
+ float16x8_t vf0 = vdivq_f16(ve0, vd0);
+ float16x8_t vf1 = vdivq_f16(ve1, vd1);
+ float16x8_t vf2 = vdivq_f16(ve2, vd2);
+ float16x8_t vf3 = vdivq_f16(ve3, vd3);
+ float16x8_t vf4 = vdivq_f16(ve4, vd4);
+ float16x8_t vf5 = vdivq_f16(ve5, vd5);
+ float16x8_t vf6 = vdivq_f16(ve6, vd6);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+ vf5 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf5), vcagtq_f16(vx5, vdenorm_cutoff)));
+ vf6 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf6), vcagtq_f16(vx6, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+ const uint16x8_t vm5 = vcltq_f16(vx5, vmovq_n_f16(0.0f));
+ const uint16x8_t vm6 = vcltq_f16(vx6, vmovq_n_f16(0.0f));
+
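+ // vm is the x < 0 mask: keep the computed sigmoid(-z) for negative inputs and
+ // use the symmetry sigmoid(x) = 1 - sigmoid(-x) for the rest.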
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+ vf5 = vbslq_f16(vm5, vf5, vsubq_f16(vone, vf5));
+ vf6 = vbslq_f16(vm6, vf6, vsubq_f16(vone, vf6));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ vst1q_f16(o, vf5); o += 8;
+ vst1q_f16(o, vf6); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
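+ // Store the 1-7 leftover halves: 4, then 2, then 1 lanes, shifting the
+ // already-written lanes out of vf_lo between stores.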
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x64.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x64.c
new file mode 100644
index 0000000..b886bb8
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x64.c
@@ -0,0 +1,240 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
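+ // The fp16 constants live in the params struct as raw uint16_t bit patterns
+ // and are broadcast with vld1q_dup_u16; only vone is materialized in code.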
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 64 * sizeof(__fp16); batch -= 64 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+ const float16x8_t vx5 = vld1q_f16(i); i += 8;
+ const float16x8_t vx6 = vld1q_f16(i); i += 8;
+ const float16x8_t vx7 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+ const float16x8_t vz5 = vabsq_f16(vx5);
+ const float16x8_t vz6 = vabsq_f16(vx6);
+ const float16x8_t vz7 = vabsq_f16(vx7);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+ float16x8_t vn5 = vfmaq_f16(vmagic_bias, vz5, vminus_log2e);
+ float16x8_t vn6 = vfmaq_f16(vmagic_bias, vz6, vminus_log2e);
+ float16x8_t vn7 = vfmaq_f16(vmagic_bias, vz7, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+ const float16x8_t vs5 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn5), 10));
+ const float16x8_t vs6 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn6), 10));
+ const float16x8_t vs7 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn7), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+ vn5 = vsubq_f16(vn5, vmagic_bias);
+ vn6 = vsubq_f16(vn6, vmagic_bias);
+ vn7 = vsubq_f16(vn7, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+ float16x8_t vt5 = vfmaq_f16(vz5, vn5, vln2);
+ float16x8_t vt6 = vfmaq_f16(vz6, vn6, vln2);
+ float16x8_t vt7 = vfmaq_f16(vz7, vn7, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+ float16x8_t vp5 = vfmaq_f16(vc2, vc3, vt5);
+ float16x8_t vp6 = vfmaq_f16(vc2, vc3, vt6);
+ float16x8_t vp7 = vfmaq_f16(vc2, vc3, vt7);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+ vp5 = vfmsq_f16(vone, vp5, vt5);
+ vp6 = vfmsq_f16(vone, vp6, vt6);
+ vp7 = vfmsq_f16(vone, vp7, vt7);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+ vt5 = vmulq_f16(vt5, vs5);
+ vt6 = vmulq_f16(vt6, vs6);
+ vt7 = vmulq_f16(vt7, vs7);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+ const float16x8_t ve5 = vfmsq_f16(vs5, vp5, vt5);
+ const float16x8_t ve6 = vfmsq_f16(vs6, vp6, vt6);
+ const float16x8_t ve7 = vfmsq_f16(vs7, vp7, vt7);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+ const float16x8_t vd5 = vaddq_f16(ve5, vone);
+ const float16x8_t vd6 = vaddq_f16(ve6, vone);
+ const float16x8_t vd7 = vaddq_f16(ve7, vone);
+
+ float16x8_t vf0 = vdivq_f16(ve0, vd0);
+ float16x8_t vf1 = vdivq_f16(ve1, vd1);
+ float16x8_t vf2 = vdivq_f16(ve2, vd2);
+ float16x8_t vf3 = vdivq_f16(ve3, vd3);
+ float16x8_t vf4 = vdivq_f16(ve4, vd4);
+ float16x8_t vf5 = vdivq_f16(ve5, vd5);
+ float16x8_t vf6 = vdivq_f16(ve6, vd6);
+ float16x8_t vf7 = vdivq_f16(ve7, vd7);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+ vf5 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf5), vcagtq_f16(vx5, vdenorm_cutoff)));
+ vf6 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf6), vcagtq_f16(vx6, vdenorm_cutoff)));
+ vf7 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf7), vcagtq_f16(vx7, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+ const uint16x8_t vm5 = vcltq_f16(vx5, vmovq_n_f16(0.0f));
+ const uint16x8_t vm6 = vcltq_f16(vx6, vmovq_n_f16(0.0f));
+ const uint16x8_t vm7 = vcltq_f16(vx7, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+ vf5 = vbslq_f16(vm5, vf5, vsubq_f16(vone, vf5));
+ vf6 = vbslq_f16(vm6, vf6, vsubq_f16(vone, vf6));
+ vf7 = vbslq_f16(vm7, vf7, vsubq_f16(vone, vf7));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ vst1q_f16(o, vf5); o += 8;
+ vst1q_f16(o, vf6); o += 8;
+ vst1q_f16(o, vf7); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x8.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x8.c
new file mode 100644
index 0000000..5bb7e5d
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-div-x8.c
@@ -0,0 +1,95 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
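+ // Work on z = |x|: sigmoid(-z) is computed for every lane and the result for
+ // non-negative inputs is recovered by symmetry at the end of the loop body.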
+ const float16x8_t vz = vabsq_f16(vx);
+
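+ // Range reduction: n = round(z * -log2(e)) via the magic-bias trick,
+ // s = 2**n from the shifted bits of vn, and t = z + n*ln(2) lands in
+ // [-ln(2)/2, ln(2)/2].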
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
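+ // Start the degree-3 polynomial e**-t ~ 1 - t + c2*t**2 + c3*t**3:
+ // vp = 1 - t*(c2 + c3*t); the final multiply by t folds into the
+ // reconstruction step below.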
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
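+ // Reconstruct e**-z = s * e**-t: scaling t by s first makes the last
+ // polynomial step a single fused op, ve = s - p*(t*s) = s*(1 - t*p);
+ // the denominator is d = 1 + e**-z.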
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
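+ // sigmoid(-z) = e**-z / d; flush to zero past the denorm cutoff, then flip to
+ // 1 - sigmoid(-z) for non-negative x.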
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
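+ // 1-7 trailing halves: a full vector is still loaded (XNN_OOB_READS marks the
+ // kernel as allowed to read past the buffer), but only `batch` lanes are stored.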
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ float16x8_t vf = vdivq_f16(ve, vd);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x16.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x16.c
new file mode 100644
index 0000000..8c6a7ae
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x16.c
@@ -0,0 +1,149 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 16 * sizeof(__fp16); batch -= 16 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+
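+ // recpe variant: the division is replaced by VRECPE's reciprocal estimate
+ // (roughly 8 bits of precision) and a multiply, with no Newton-Raphson
+ // refinement; this trades accuracy for throughput relative to the div kernels.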
+ float16x8_t vr0 = vrecpeq_f16(vd0);
+ float16x8_t vr1 = vrecpeq_f16(vd1);
+
+ float16x8_t vf0 = vmulq_f16(ve0, vr0);
+ float16x8_t vf1 = vmulq_f16(ve1, vr1);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x24.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x24.c
new file mode 100644
index 0000000..9798845
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x24.c
@@ -0,0 +1,166 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 24 * sizeof(__fp16); batch -= 24 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+
+ float16x8_t vr0 = vrecpeq_f16(vd0);
+ float16x8_t vr1 = vrecpeq_f16(vd1);
+ float16x8_t vr2 = vrecpeq_f16(vd2);
+
+ float16x8_t vf0 = vmulq_f16(ve0, vr0);
+ float16x8_t vf1 = vmulq_f16(ve1, vr1);
+ float16x8_t vf2 = vmulq_f16(ve2, vr2);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x32.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x32.c
new file mode 100644
index 0000000..8d20cb0
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x32.c
@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 32 * sizeof(__fp16); batch -= 32 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+
+ float16x8_t vr0 = vrecpeq_f16(vd0);
+ float16x8_t vr1 = vrecpeq_f16(vd1);
+ float16x8_t vr2 = vrecpeq_f16(vd2);
+ float16x8_t vr3 = vrecpeq_f16(vd3);
+
+ float16x8_t vf0 = vmulq_f16(ve0, vr0);
+ float16x8_t vf1 = vmulq_f16(ve1, vr1);
+ float16x8_t vf2 = vmulq_f16(ve2, vr2);
+ float16x8_t vf3 = vmulq_f16(ve3, vr3);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x40.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x40.c
new file mode 100644
index 0000000..a3f46f9
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x40.c
@@ -0,0 +1,200 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 40 * sizeof(__fp16); batch -= 40 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+
+ float16x8_t vr0 = vrecpeq_f16(vd0);
+ float16x8_t vr1 = vrecpeq_f16(vd1);
+ float16x8_t vr2 = vrecpeq_f16(vd2);
+ float16x8_t vr3 = vrecpeq_f16(vd3);
+ float16x8_t vr4 = vrecpeq_f16(vd4);
+
+ float16x8_t vf0 = vmulq_f16(ve0, vr0);
+ float16x8_t vf1 = vmulq_f16(ve1, vr1);
+ float16x8_t vf2 = vmulq_f16(ve2, vr2);
+ float16x8_t vf3 = vmulq_f16(ve3, vr3);
+ float16x8_t vf4 = vmulq_f16(ve4, vr4);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x48.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x48.c
new file mode 100644
index 0000000..af4133e
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x48.c
@@ -0,0 +1,217 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 48 * sizeof(__fp16); batch -= 48 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+ const float16x8_t vx5 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+ const float16x8_t vz5 = vabsq_f16(vx5);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+ float16x8_t vn5 = vfmaq_f16(vmagic_bias, vz5, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+ const float16x8_t vs5 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn5), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+ vn5 = vsubq_f16(vn5, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+ float16x8_t vt5 = vfmaq_f16(vz5, vn5, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+ float16x8_t vp5 = vfmaq_f16(vc2, vc3, vt5);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+ vp5 = vfmsq_f16(vone, vp5, vt5);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+ vt5 = vmulq_f16(vt5, vs5);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+ const float16x8_t ve5 = vfmsq_f16(vs5, vp5, vt5);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+ const float16x8_t vd5 = vaddq_f16(ve5, vone);
+
+ float16x8_t vr0 = vrecpeq_f16(vd0);
+ float16x8_t vr1 = vrecpeq_f16(vd1);
+ float16x8_t vr2 = vrecpeq_f16(vd2);
+ float16x8_t vr3 = vrecpeq_f16(vd3);
+ float16x8_t vr4 = vrecpeq_f16(vd4);
+ float16x8_t vr5 = vrecpeq_f16(vd5);
+
+ float16x8_t vf0 = vmulq_f16(ve0, vr0);
+ float16x8_t vf1 = vmulq_f16(ve1, vr1);
+ float16x8_t vf2 = vmulq_f16(ve2, vr2);
+ float16x8_t vf3 = vmulq_f16(ve3, vr3);
+ float16x8_t vf4 = vmulq_f16(ve4, vr4);
+ float16x8_t vf5 = vmulq_f16(ve5, vr5);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+ vf5 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf5), vcagtq_f16(vx5, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+ const uint16x8_t vm5 = vcltq_f16(vx5, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+ vf5 = vbslq_f16(vm5, vf5, vsubq_f16(vone, vf5));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ vst1q_f16(o, vf5); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x56.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x56.c
new file mode 100644
index 0000000..f752b62
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x56.c
@@ -0,0 +1,234 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 56 * sizeof(__fp16); batch -= 56 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+ const float16x8_t vx5 = vld1q_f16(i); i += 8;
+ const float16x8_t vx6 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+ const float16x8_t vz5 = vabsq_f16(vx5);
+ const float16x8_t vz6 = vabsq_f16(vx6);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+ float16x8_t vn5 = vfmaq_f16(vmagic_bias, vz5, vminus_log2e);
+ float16x8_t vn6 = vfmaq_f16(vmagic_bias, vz6, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+ const float16x8_t vs5 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn5), 10));
+ const float16x8_t vs6 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn6), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+ vn5 = vsubq_f16(vn5, vmagic_bias);
+ vn6 = vsubq_f16(vn6, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+ float16x8_t vt5 = vfmaq_f16(vz5, vn5, vln2);
+ float16x8_t vt6 = vfmaq_f16(vz6, vn6, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+ float16x8_t vp5 = vfmaq_f16(vc2, vc3, vt5);
+ float16x8_t vp6 = vfmaq_f16(vc2, vc3, vt6);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+ vp5 = vfmsq_f16(vone, vp5, vt5);
+ vp6 = vfmsq_f16(vone, vp6, vt6);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+ vt5 = vmulq_f16(vt5, vs5);
+ vt6 = vmulq_f16(vt6, vs6);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+ const float16x8_t ve5 = vfmsq_f16(vs5, vp5, vt5);
+ const float16x8_t ve6 = vfmsq_f16(vs6, vp6, vt6);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+ const float16x8_t vd5 = vaddq_f16(ve5, vone);
+ const float16x8_t vd6 = vaddq_f16(ve6, vone);
+
+ float16x8_t vr0 = vrecpeq_f16(vd0);
+ float16x8_t vr1 = vrecpeq_f16(vd1);
+ float16x8_t vr2 = vrecpeq_f16(vd2);
+ float16x8_t vr3 = vrecpeq_f16(vd3);
+ float16x8_t vr4 = vrecpeq_f16(vd4);
+ float16x8_t vr5 = vrecpeq_f16(vd5);
+ float16x8_t vr6 = vrecpeq_f16(vd6);
+
+ float16x8_t vf0 = vmulq_f16(ve0, vr0);
+ float16x8_t vf1 = vmulq_f16(ve1, vr1);
+ float16x8_t vf2 = vmulq_f16(ve2, vr2);
+ float16x8_t vf3 = vmulq_f16(ve3, vr3);
+ float16x8_t vf4 = vmulq_f16(ve4, vr4);
+ float16x8_t vf5 = vmulq_f16(ve5, vr5);
+ float16x8_t vf6 = vmulq_f16(ve6, vr6);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+ vf5 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf5), vcagtq_f16(vx5, vdenorm_cutoff)));
+ vf6 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf6), vcagtq_f16(vx6, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+ const uint16x8_t vm5 = vcltq_f16(vx5, vmovq_n_f16(0.0f));
+ const uint16x8_t vm6 = vcltq_f16(vx6, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+ vf5 = vbslq_f16(vm5, vf5, vsubq_f16(vone, vf5));
+ vf6 = vbslq_f16(vm6, vf6, vsubq_f16(vone, vf6));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ vst1q_f16(o, vf5); o += 8;
+ vst1q_f16(o, vf6); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
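
A note on the `vshlq_n_s16(..., 10)` step every variant uses to build s = 2**n: adding the magic bias both rounds z * (-log2(e)) to the nearest integer and plants n plus the fp16 exponent bias (15) in the low mantissa bits of vn, so a left shift by 10 (the fp16 mantissa width) turns those bits into a valid exponent field. A float32 analogue of the trick as a standalone sketch; the f32 bias 0x1.8000FEp23 (whose low mantissa bits encode 127) is an assumption for the f32 case, not a constant from this patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  const float x = -3.3f;                 /* want n = round(x) and 2**n */
  float vn = x + 0x1.8000FEp23f;         /* magic bias: rounds, plants 127+n in low bits */
  uint32_t bits;
  memcpy(&bits, &vn, sizeof(bits));
  const uint32_t sbits = bits << 23;     /* low mantissa bits -> exponent field */
  float s;
  memcpy(&s, &sbits, sizeof(s));
  const float n = vn - 0x1.8000FEp23f;   /* recover the rounded n */
  printf("n = %g, 2**n = %g\n", n, s);   /* prints: n = -3, 2**n = 0.125 */
  return 0;
}
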
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x64.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x64.c
new file mode 100644
index 0000000..942b999
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x64.c
@@ -0,0 +1,251 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 64 * sizeof(__fp16); batch -= 64 * sizeof(__fp16)) {
+ const float16x8_t vx0 = vld1q_f16(i); i += 8;
+ const float16x8_t vx1 = vld1q_f16(i); i += 8;
+ const float16x8_t vx2 = vld1q_f16(i); i += 8;
+ const float16x8_t vx3 = vld1q_f16(i); i += 8;
+ const float16x8_t vx4 = vld1q_f16(i); i += 8;
+ const float16x8_t vx5 = vld1q_f16(i); i += 8;
+ const float16x8_t vx6 = vld1q_f16(i); i += 8;
+ const float16x8_t vx7 = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz0 = vabsq_f16(vx0);
+ const float16x8_t vz1 = vabsq_f16(vx1);
+ const float16x8_t vz2 = vabsq_f16(vx2);
+ const float16x8_t vz3 = vabsq_f16(vx3);
+ const float16x8_t vz4 = vabsq_f16(vx4);
+ const float16x8_t vz5 = vabsq_f16(vx5);
+ const float16x8_t vz6 = vabsq_f16(vx6);
+ const float16x8_t vz7 = vabsq_f16(vx7);
+
+ float16x8_t vn0 = vfmaq_f16(vmagic_bias, vz0, vminus_log2e);
+ float16x8_t vn1 = vfmaq_f16(vmagic_bias, vz1, vminus_log2e);
+ float16x8_t vn2 = vfmaq_f16(vmagic_bias, vz2, vminus_log2e);
+ float16x8_t vn3 = vfmaq_f16(vmagic_bias, vz3, vminus_log2e);
+ float16x8_t vn4 = vfmaq_f16(vmagic_bias, vz4, vminus_log2e);
+ float16x8_t vn5 = vfmaq_f16(vmagic_bias, vz5, vminus_log2e);
+ float16x8_t vn6 = vfmaq_f16(vmagic_bias, vz6, vminus_log2e);
+ float16x8_t vn7 = vfmaq_f16(vmagic_bias, vz7, vminus_log2e);
+
+ const float16x8_t vs0 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn0), 10));
+ const float16x8_t vs1 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn1), 10));
+ const float16x8_t vs2 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn2), 10));
+ const float16x8_t vs3 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn3), 10));
+ const float16x8_t vs4 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn4), 10));
+ const float16x8_t vs5 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn5), 10));
+ const float16x8_t vs6 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn6), 10));
+ const float16x8_t vs7 = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn7), 10));
+
+ vn0 = vsubq_f16(vn0, vmagic_bias);
+ vn1 = vsubq_f16(vn1, vmagic_bias);
+ vn2 = vsubq_f16(vn2, vmagic_bias);
+ vn3 = vsubq_f16(vn3, vmagic_bias);
+ vn4 = vsubq_f16(vn4, vmagic_bias);
+ vn5 = vsubq_f16(vn5, vmagic_bias);
+ vn6 = vsubq_f16(vn6, vmagic_bias);
+ vn7 = vsubq_f16(vn7, vmagic_bias);
+
+ float16x8_t vt0 = vfmaq_f16(vz0, vn0, vln2);
+ float16x8_t vt1 = vfmaq_f16(vz1, vn1, vln2);
+ float16x8_t vt2 = vfmaq_f16(vz2, vn2, vln2);
+ float16x8_t vt3 = vfmaq_f16(vz3, vn3, vln2);
+ float16x8_t vt4 = vfmaq_f16(vz4, vn4, vln2);
+ float16x8_t vt5 = vfmaq_f16(vz5, vn5, vln2);
+ float16x8_t vt6 = vfmaq_f16(vz6, vn6, vln2);
+ float16x8_t vt7 = vfmaq_f16(vz7, vn7, vln2);
+
+ float16x8_t vp0 = vfmaq_f16(vc2, vc3, vt0);
+ float16x8_t vp1 = vfmaq_f16(vc2, vc3, vt1);
+ float16x8_t vp2 = vfmaq_f16(vc2, vc3, vt2);
+ float16x8_t vp3 = vfmaq_f16(vc2, vc3, vt3);
+ float16x8_t vp4 = vfmaq_f16(vc2, vc3, vt4);
+ float16x8_t vp5 = vfmaq_f16(vc2, vc3, vt5);
+ float16x8_t vp6 = vfmaq_f16(vc2, vc3, vt6);
+ float16x8_t vp7 = vfmaq_f16(vc2, vc3, vt7);
+
+ vp0 = vfmsq_f16(vone, vp0, vt0);
+ vp1 = vfmsq_f16(vone, vp1, vt1);
+ vp2 = vfmsq_f16(vone, vp2, vt2);
+ vp3 = vfmsq_f16(vone, vp3, vt3);
+ vp4 = vfmsq_f16(vone, vp4, vt4);
+ vp5 = vfmsq_f16(vone, vp5, vt5);
+ vp6 = vfmsq_f16(vone, vp6, vt6);
+ vp7 = vfmsq_f16(vone, vp7, vt7);
+
+ vt0 = vmulq_f16(vt0, vs0);
+ vt1 = vmulq_f16(vt1, vs1);
+ vt2 = vmulq_f16(vt2, vs2);
+ vt3 = vmulq_f16(vt3, vs3);
+ vt4 = vmulq_f16(vt4, vs4);
+ vt5 = vmulq_f16(vt5, vs5);
+ vt6 = vmulq_f16(vt6, vs6);
+ vt7 = vmulq_f16(vt7, vs7);
+
+ const float16x8_t ve0 = vfmsq_f16(vs0, vp0, vt0);
+ const float16x8_t ve1 = vfmsq_f16(vs1, vp1, vt1);
+ const float16x8_t ve2 = vfmsq_f16(vs2, vp2, vt2);
+ const float16x8_t ve3 = vfmsq_f16(vs3, vp3, vt3);
+ const float16x8_t ve4 = vfmsq_f16(vs4, vp4, vt4);
+ const float16x8_t ve5 = vfmsq_f16(vs5, vp5, vt5);
+ const float16x8_t ve6 = vfmsq_f16(vs6, vp6, vt6);
+ const float16x8_t ve7 = vfmsq_f16(vs7, vp7, vt7);
+
+ const float16x8_t vd0 = vaddq_f16(ve0, vone);
+ const float16x8_t vd1 = vaddq_f16(ve1, vone);
+ const float16x8_t vd2 = vaddq_f16(ve2, vone);
+ const float16x8_t vd3 = vaddq_f16(ve3, vone);
+ const float16x8_t vd4 = vaddq_f16(ve4, vone);
+ const float16x8_t vd5 = vaddq_f16(ve5, vone);
+ const float16x8_t vd6 = vaddq_f16(ve6, vone);
+ const float16x8_t vd7 = vaddq_f16(ve7, vone);
+
+ float16x8_t vr0 = vrecpeq_f16(vd0);
+ float16x8_t vr1 = vrecpeq_f16(vd1);
+ float16x8_t vr2 = vrecpeq_f16(vd2);
+ float16x8_t vr3 = vrecpeq_f16(vd3);
+ float16x8_t vr4 = vrecpeq_f16(vd4);
+ float16x8_t vr5 = vrecpeq_f16(vd5);
+ float16x8_t vr6 = vrecpeq_f16(vd6);
+ float16x8_t vr7 = vrecpeq_f16(vd7);
+
+ float16x8_t vf0 = vmulq_f16(ve0, vr0);
+ float16x8_t vf1 = vmulq_f16(ve1, vr1);
+ float16x8_t vf2 = vmulq_f16(ve2, vr2);
+ float16x8_t vf3 = vmulq_f16(ve3, vr3);
+ float16x8_t vf4 = vmulq_f16(ve4, vr4);
+ float16x8_t vf5 = vmulq_f16(ve5, vr5);
+ float16x8_t vf6 = vmulq_f16(ve6, vr6);
+ float16x8_t vf7 = vmulq_f16(ve7, vr7);
+
+ vf0 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf0), vcagtq_f16(vx0, vdenorm_cutoff)));
+ vf1 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf1), vcagtq_f16(vx1, vdenorm_cutoff)));
+ vf2 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf2), vcagtq_f16(vx2, vdenorm_cutoff)));
+ vf3 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf3), vcagtq_f16(vx3, vdenorm_cutoff)));
+ vf4 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf4), vcagtq_f16(vx4, vdenorm_cutoff)));
+ vf5 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf5), vcagtq_f16(vx5, vdenorm_cutoff)));
+ vf6 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf6), vcagtq_f16(vx6, vdenorm_cutoff)));
+ vf7 = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf7), vcagtq_f16(vx7, vdenorm_cutoff)));
+
+ const uint16x8_t vm0 = vcltq_f16(vx0, vmovq_n_f16(0.0f));
+ const uint16x8_t vm1 = vcltq_f16(vx1, vmovq_n_f16(0.0f));
+ const uint16x8_t vm2 = vcltq_f16(vx2, vmovq_n_f16(0.0f));
+ const uint16x8_t vm3 = vcltq_f16(vx3, vmovq_n_f16(0.0f));
+ const uint16x8_t vm4 = vcltq_f16(vx4, vmovq_n_f16(0.0f));
+ const uint16x8_t vm5 = vcltq_f16(vx5, vmovq_n_f16(0.0f));
+ const uint16x8_t vm6 = vcltq_f16(vx6, vmovq_n_f16(0.0f));
+ const uint16x8_t vm7 = vcltq_f16(vx7, vmovq_n_f16(0.0f));
+
+ vf0 = vbslq_f16(vm0, vf0, vsubq_f16(vone, vf0));
+ vf1 = vbslq_f16(vm1, vf1, vsubq_f16(vone, vf1));
+ vf2 = vbslq_f16(vm2, vf2, vsubq_f16(vone, vf2));
+ vf3 = vbslq_f16(vm3, vf3, vsubq_f16(vone, vf3));
+ vf4 = vbslq_f16(vm4, vf4, vsubq_f16(vone, vf4));
+ vf5 = vbslq_f16(vm5, vf5, vsubq_f16(vone, vf5));
+ vf6 = vbslq_f16(vm6, vf6, vsubq_f16(vone, vf6));
+ vf7 = vbslq_f16(vm7, vf7, vsubq_f16(vone, vf7));
+
+ vst1q_f16(o, vf0); o += 8;
+ vst1q_f16(o, vf1); o += 8;
+ vst1q_f16(o, vf2); o += 8;
+ vst1q_f16(o, vf3); o += 8;
+ vst1q_f16(o, vf4); o += 8;
+ vst1q_f16(o, vf5); o += 8;
+ vst1q_f16(o, vf6); o += 8;
+ vst1q_f16(o, vf7); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
diff --git a/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x8.c b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x8.c
new file mode 100644
index 0000000..4271621
--- /dev/null
+++ b/src/f16-vsigmoid/gen/vsigmoid-neonfp16arith-rr1-p3-recpe-x8.c
@@ -0,0 +1,97 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-vsigmoid/neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
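
The div and recpe variants differ only in the final step: div computes ve/vd exactly with vdivq_f16, while recpe substitutes a single vrecpeq_f16 reciprocal estimate (about 8 bits of precision), trading a little accuracy for throughput where fp16 division is slow. Were more accuracy needed, the estimate could be refined; a hedged sketch of one Newton-Raphson step with vrecpsq_f16, which these kernels intentionally skip:

#include <arm_neon.h>

/* Sketch only: refine the ~8-bit vrecpeq_f16 estimate with one
 * Newton-Raphson step. vrecpsq_f16(d, r) computes (2 - d*r), so
 * r * (2 - d*r) is the standard update toward 1/d. The recpe kernels
 * in this patch ship the raw estimate instead. */
float16x8_t vrecp_nr1_f16(float16x8_t vd) {
  float16x8_t vr = vrecpeq_f16(vd);          /* initial estimate */
  vr = vmulq_f16(vr, vrecpsq_f16(vd, vr));   /* one refinement step */
  return vr;
}
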
diff --git a/src/f16-vsigmoid/neonfp16arith.c.in b/src/f16-vsigmoid/neonfp16arith.c.in
new file mode 100644
index 0000000..ed9b54f
--- /dev/null
+++ b/src/f16-vsigmoid/neonfp16arith.c.in
@@ -0,0 +1,161 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE >= 8
+$SIMD_TILE = BATCH_TILE // 8
+$assert DIV_ALGO in ["div", "recpe"]
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+
+
+void xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_${DIV_ALGO}_x${BATCH_TILE}(
+ size_t batch,
+ const void* input,
+ void* output,
+ const union xnn_f16_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(batch % sizeof(__fp16) == 0);
+
+ const float16x8_t vmagic_bias = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.magic_bias));
+ const float16x8_t vminus_log2e = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.minus_log2e));
+ const float16x8_t vln2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.ln2));
+ const float16x8_t vc3 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c3));
+ const float16x8_t vc2 = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.c2));
+ const float16x8_t vone = vmovq_n_f16(1.0f);
+ const float16x8_t vdenorm_cutoff = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neonfp16arith_rr1_p3.denorm_cutoff));
+
+ const __fp16* i = (const __fp16*) input;
+ __fp16* o = (__fp16*) output;
+ $if BATCH_TILE > 8:
+ for (; batch >= ${BATCH_TILE} * sizeof(__fp16); batch -= ${BATCH_TILE} * sizeof(__fp16)) {
+ $for N in range(SIMD_TILE):
+ const float16x8_t vx${ABC[N]} = vld1q_f16(i); i += 8;
+
+ $for N in range(SIMD_TILE):
+ const float16x8_t vz${ABC[N]} = vabsq_f16(vx${ABC[N]});
+
+ $for N in range(SIMD_TILE):
+ float16x8_t vn${ABC[N]} = vfmaq_f16(vmagic_bias, vz${ABC[N]}, vminus_log2e);
+
+ $for N in range(SIMD_TILE):
+ const float16x8_t vs${ABC[N]} = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn${ABC[N]}), 10));
+
+ $for N in range(SIMD_TILE):
+ vn${ABC[N]} = vsubq_f16(vn${ABC[N]}, vmagic_bias);
+
+ $for N in range(SIMD_TILE):
+ float16x8_t vt${ABC[N]} = vfmaq_f16(vz${ABC[N]}, vn${ABC[N]}, vln2);
+
+ $for N in range(SIMD_TILE):
+ float16x8_t vp${ABC[N]} = vfmaq_f16(vc2, vc3, vt${ABC[N]});
+
+ $for N in range(SIMD_TILE):
+ vp${ABC[N]} = vfmsq_f16(vone, vp${ABC[N]}, vt${ABC[N]});
+
+ $for N in range(SIMD_TILE):
+ vt${ABC[N]} = vmulq_f16(vt${ABC[N]}, vs${ABC[N]});
+
+ $for N in range(SIMD_TILE):
+ const float16x8_t ve${ABC[N]} = vfmsq_f16(vs${ABC[N]}, vp${ABC[N]}, vt${ABC[N]});
+
+ $for N in range(SIMD_TILE):
+ const float16x8_t vd${ABC[N]} = vaddq_f16(ve${ABC[N]}, vone);
+
+ $if DIV_ALGO == "div":
+ $for N in range(SIMD_TILE):
+ float16x8_t vf${ABC[N]} = vdivq_f16(ve${ABC[N]}, vd${ABC[N]});
+ $else:
+ $for N in range(SIMD_TILE):
+ float16x8_t vr${ABC[N]} = vrecpeq_f16(vd${ABC[N]});
+
+ $for N in range(SIMD_TILE):
+ float16x8_t vf${ABC[N]} = vmulq_f16(ve${ABC[N]}, vr${ABC[N]});
+
+ $for N in range(SIMD_TILE):
+ vf${ABC[N]} = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf${ABC[N]}), vcagtq_f16(vx${ABC[N]}, vdenorm_cutoff)));
+
+ $for N in range(SIMD_TILE):
+ const uint16x8_t vm${ABC[N]} = vcltq_f16(vx${ABC[N]}, vmovq_n_f16(0.0f));
+
+ $for N in range(SIMD_TILE):
+ vf${ABC[N]} = vbslq_f16(vm${ABC[N]}, vf${ABC[N]}, vsubq_f16(vone, vf${ABC[N]}));
+
+ $for N in range(SIMD_TILE):
+ vst1q_f16(o, vf${ABC[N]}); o += 8;
+ }
+ for (; batch >= 8 * sizeof(__fp16); batch -= 8 * sizeof(__fp16)) {
+ const float16x8_t vx = vld1q_f16(i); i += 8;
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ $if DIV_ALGO == "div":
+ float16x8_t vf = vdivq_f16(ve, vd);
+ $else:
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ vst1q_f16(o, vf); o += 8;
+ }
+ if XNN_UNLIKELY(batch != 0) {
+ const float16x8_t vx = vld1q_f16(i);
+
+ const float16x8_t vz = vabsq_f16(vx);
+
+ float16x8_t vn = vfmaq_f16(vmagic_bias, vz, vminus_log2e);
+ const float16x8_t vs = vreinterpretq_f16_s16(vshlq_n_s16(vreinterpretq_s16_f16(vn), 10));
+ vn = vsubq_f16(vn, vmagic_bias);
+ float16x8_t vt = vfmaq_f16(vz, vn, vln2);
+
+ float16x8_t vp = vfmaq_f16(vc2, vc3, vt);
+ vp = vfmsq_f16(vone, vp, vt);
+
+ vt = vmulq_f16(vt, vs);
+ const float16x8_t ve = vfmsq_f16(vs, vp, vt);
+ const float16x8_t vd = vaddq_f16(ve, vone);
+
+ $if DIV_ALGO == "div":
+ float16x8_t vf = vdivq_f16(ve, vd);
+ $else:
+ const float16x8_t vr = vrecpeq_f16(vd);
+ float16x8_t vf = vmulq_f16(ve, vr);
+ vf = vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(vf), vcagtq_f16(vx, vdenorm_cutoff)));
+ const uint16x8_t vm = vcltq_f16(vx, vmovq_n_f16(0.0f));
+ vf = vbslq_f16(vm, vf, vsubq_f16(vone, vf));
+
+ float16x4_t vf_lo = vget_low_f16(vf);
+ if (batch & (4 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 4;
+ vf_lo = vget_high_f16(vf);
+ }
+ if (batch & (2 * sizeof(__fp16))) {
+ vst1_f16(o, vf_lo); o += 2;
+ vf_lo = vext_f16(vf_lo, vf_lo, 2);
+ }
+ if (batch & (1 * sizeof(__fp16))) {
+ vst1_lane_f16(o, vf_lo, 0);
+ }
+ }
+}
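
Recapping the recipe the template encodes: with z = |x| and e = exp(-z), sigmoid(x) equals e/(e + 1) for negative x and 1 - e/(e + 1) otherwise. exp(-z) is evaluated as 2**n * p(t), where n = round(-z * log2(e)) is the single-constant ("rr1") range reduction, t = z - |n|*ln(2) lies in [-ln2/2, ln2/2], and p is the degree-3 ("p3") polynomial from the src/math comments. A scalar float sketch of the whole flow with the patch's constants (illustrative; the real kernels round every step in fp16):

#include <math.h>
#include <stdio.h>

/* Scalar float model of the rr1-p3 recipe (illustrative only; the kernels
 * do everything in fp16 and build 2**n via the exponent-field shift). */
static float sigmoid_rr1_p3_ref(float x) {
  const float z = fabsf(x);
  if (z > 0x1.368p+3f) {                    /* denorm cutoff: exp(-z) flushes to 0 */
    return x < 0.0f ? 0.0f : 1.0f;
  }
  const float n = rintf(z * -0x1.714p+0f);  /* n = round(-z * log2(e)) <= 0 */
  const float s = exp2f(n);                 /* s = 2**n */
  const float t = z + n * 0x1.630p-1f;      /* t = z - |n|*ln2, |t| <= ln2/2 */
  /* exp(-t) ~ 1 + t*(-1 + t*(c2 + t*c3)) */
  const float p = 1.0f - t + (0x1.020p-1f + (-0x1.558p-3f) * t) * t * t;
  const float e = s * p;                    /* e ~ exp(-z) */
  const float f = e / (e + 1.0f);           /* sigmoid(-z); recpe estimates 1/(e+1) */
  return x < 0.0f ? f : 1.0f - f;
}

int main(void) {
  printf("%f %f %f\n",
         sigmoid_rr1_p3_ref(-1.0f),   /* ~0.268941 */
         sigmoid_rr1_p3_ref(0.0f),    /*  0.500000 */
         sigmoid_rr1_p3_ref(1.0f));   /* ~0.731059 */
  return 0;
}
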
diff --git a/src/math/sigmoid-f16-neonfp16arith-rr1-p3-div.c b/src/math/sigmoid-f16-neonfp16arith-rr1-p3-div.c
index 64547bd..d9594e9 100644
--- a/src/math/sigmoid-f16-neonfp16arith-rr1-p3-div.c
+++ b/src/math/sigmoid-f16-neonfp16arith-rr1-p3-div.c
@@ -25,9 +25,9 @@
// Coefficient of polynomial approximation
// exp(-t) ~ 1 + t * (-1 + t * (c2 + t * c3))
// on [-log(2)/2, log(2)/2]
- const float16x8_t vone = vmovq_n_f16(1.0f);
- const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
const float16x8_t vc3 = vmovq_n_f16(-0x1.558p-3f);
+ const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
+ const float16x8_t vone = vmovq_n_f16(1.0f);
// The largest z for which sigmoidh(-z) is normalized.
// This number is also the largest z for which exph(-z) is normalized.
const float16x8_t vdenorm_cutoff = vmovq_n_f16(-0x1.368p+3f);
diff --git a/src/math/sigmoid-f16-neonfp16arith-rr1-p3-recpe.c b/src/math/sigmoid-f16-neonfp16arith-rr1-p3-recpe.c
index 8b3a3c0..70f4e96 100644
--- a/src/math/sigmoid-f16-neonfp16arith-rr1-p3-recpe.c
+++ b/src/math/sigmoid-f16-neonfp16arith-rr1-p3-recpe.c
@@ -25,9 +25,9 @@
// Coefficient of polynomial approximation
// exp(-t) ~ 1 + t * (-1 + t * (c2 + t * c3))
// on [-log(2)/2, log(2)/2]
- const float16x8_t vone = vmovq_n_f16(1.0f);
- const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
const float16x8_t vc3 = vmovq_n_f16(-0x1.558p-3f);
+ const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
+ const float16x8_t vone = vmovq_n_f16(1.0f);
// The largest z for which sigmoidh(-z) is normalized.
// This number is also the largest z for which exph(-z) is normalized.
const float16x8_t vdenorm_cutoff = vmovq_n_f16(-0x1.368p+3f);
diff --git a/src/math/sigmoid-f16-neonfp16arith-rr2-p3-div.c b/src/math/sigmoid-f16-neonfp16arith-rr2-p3-div.c
index 8c8de4a..c2fe60c 100644
--- a/src/math/sigmoid-f16-neonfp16arith-rr2-p3-div.c
+++ b/src/math/sigmoid-f16-neonfp16arith-rr2-p3-div.c
@@ -26,9 +26,9 @@
// Coefficient of polynomial approximation
// exp(-t) ~ 1 + t * (-1 + t * (c2 + t * c3))
// on [-log(2)/2, log(2)/2]
- const float16x8_t vone = vmovq_n_f16(1.0f);
- const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
const float16x8_t vc3 = vmovq_n_f16(-0x1.558p-3f);
+ const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
+ const float16x8_t vone = vmovq_n_f16(1.0f);
// The largest z for which sigmoidh(-z) is normalized.
// This number is also the largest z for which exph(-z) is normalized.
const float16x8_t vdenorm_cutoff = vmovq_n_f16(-0x1.368p+3f);
diff --git a/src/math/sigmoid-f16-neonfp16arith-rr2-p3-recpe.c b/src/math/sigmoid-f16-neonfp16arith-rr2-p3-recpe.c
index 2201729..b2a4d98 100644
--- a/src/math/sigmoid-f16-neonfp16arith-rr2-p3-recpe.c
+++ b/src/math/sigmoid-f16-neonfp16arith-rr2-p3-recpe.c
@@ -26,9 +26,9 @@
// Coefficient of polynomial approximation
// exp(-t) ~ 1 + t * (-1 + t * (c2 + t * c3))
// on [-log(2)/2, log(2)/2]
- const float16x8_t vone = vmovq_n_f16(1.0f);
- const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
const float16x8_t vc3 = vmovq_n_f16(-0x1.558p-3f);
+ const float16x8_t vc2 = vmovq_n_f16(0x1.020p-1f);
+ const float16x8_t vone = vmovq_n_f16(1.0f);
// The largest z for which sigmoidh(-z) is normalized.
// This number is also the largest z for which exph(-z) is normalized.
const float16x8_t vdenorm_cutoff = vmovq_n_f16(-0x1.368p+3f);
diff --git a/src/params-init.c b/src/params-init.c
index 31244ee..ee0877d 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -1955,6 +1955,19 @@
}
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+#if XNN_ARCH_ARM64
+void xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params(
+ union xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
+{
+ params->neonfp16arith_rr1_p3.magic_bias = UINT16_C(0x660F); // 0x1.83Cp+10h
+ params->neonfp16arith_rr1_p3.minus_log2e = UINT16_C(0xBDC5); // -0x1.714p+0h
+ params->neonfp16arith_rr1_p3.ln2 = UINT16_C(0x398C); // 0x1.630p-1h
+ params->neonfp16arith_rr1_p3.c3 = UINT16_C(0xB156); // -0x1.558p-3h
+ params->neonfp16arith_rr1_p3.c2 = UINT16_C(0x3808); // 0x1.020p-1h
+ params->neonfp16arith_rr1_p3.denorm_cutoff = UINT16_C(0xC8DA); // -0x1.368p+3h
+}
+#endif // XNN_ARCH_ARM64
+
void xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params(
union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
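
The initializer stores each constant as a raw IEEE binary16 bit pattern, which the kernels broadcast with vld1q_dup_u16 and reinterpret as fp16 lanes; the hex-float comments can be cross-checked by decoding the bits by hand. A small portable sketch (illustrative; inf/NaN encodings not handled):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Decode an IEEE binary16 bit pattern to double, to cross-check the
 * commented values, e.g. 0x398C -> 0x1.63p-1 (ln2 rounded to fp16). */
static double fp16_bits_to_double(uint16_t bits) {
  const int sign = bits >> 15;
  const int biased_exp = (bits >> 10) & 0x1F;  /* 5 exponent bits */
  const int mant = bits & 0x3FF;               /* 10 mantissa bits */
  const double v = (biased_exp == 0)
      ? ldexp((double) mant, -24)                          /* subnormal */
      : ldexp((double) (mant | 0x400), biased_exp - 25);   /* 1.mant * 2**(e-15) */
  return sign ? -v : v;
}

int main(void) {
  printf("ln2           = %a\n", fp16_bits_to_double(0x398C));  /* 0x1.63p-1 */
  printf("minus_log2e   = %a\n", fp16_bits_to_double(0xBDC5));  /* -0x1.714p+0 */
  printf("denorm_cutoff = %a\n", fp16_bits_to_double(0xC8DA));  /* -0x1.368p+3 */
  return 0;
}
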
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index 853be8b..f785571 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -393,6 +393,15 @@
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+#define DECLARE_INIT_F16_SIGMOID_PARAMS_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ union xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)]);
+
+#if XNN_ARCH_ARM64
+ DECLARE_INIT_F16_SIGMOID_PARAMS_FUNCTION(xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params)
+#endif // XNN_ARCH_ARM64
+
+
#define DECLARE_INIT_F32_SIGMOID_PARAMS_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)]);
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 31f82a9..3ac3aeb 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -587,6 +587,20 @@
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};
+union xnn_f16_sigmoid_params {
+ char _; // Dummy member variable to comply with the C standard
+#if XNN_ARCH_ARM64
+ struct {
+ uint16_t magic_bias;
+ uint16_t minus_log2e;
+ uint16_t ln2;
+ uint16_t c3;
+ uint16_t c2;
+ uint16_t denorm_cutoff;
+ } neonfp16arith_rr1_p3;
+#endif // XNN_ARCH_ARM64
+};
+
union xnn_f32_sigmoid_params {
struct {
float magic_bias;
@@ -3139,6 +3153,12 @@
float* y,
const union xnn_f32_rnd_params* params);
+typedef void (*xnn_f16_vsigmoid_ukernel_function)(
+ size_t n,
+ const void* x,
+ void* y,
+ const union xnn_f16_sigmoid_params* params);
+
typedef void (*xnn_f32_vsigmoid_ukernel_function)(
size_t n,
const float* x,
@@ -3606,6 +3626,9 @@
union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
float scale);
+typedef void (*xnn_init_f16_sigmoid_params_fn)(
+ union xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)]);
+
typedef void (*xnn_init_f32_sigmoid_params_fn)(
union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)]);
diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h
index a0b7c12..e0ccacd 100644
--- a/src/xnnpack/vunary.h
+++ b/src/xnnpack/vunary.h
@@ -44,6 +44,32 @@
DECLARE_F16_VHSWISH_UKERNEL_FUNCTION(xnn_f16_vhswish_ukernel__f16c_x16)
+#define DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t batch, \
+ const void* input, \
+ void* output, \
+ const union xnn_f16_sigmoid_params* params);
+
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64)
+
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56)
+DECLARE_F16_VSIGMOID_UKERNEL_FUNCTION(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64)
+
+
#define DECLARE_F32_VABS_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t n, \
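
Given the declarations above, a direct call takes two steps, mirroring what VUnaryMicrokernelTester does: initialize the params union once, then invoke the ukernel with a batch size in bytes. A minimal sketch, assuming an AArch64 toolchain with FP16 arithmetic enabled:

#include <stddef.h>

#include <xnnpack/params-init.h>
#include <xnnpack/vunary.h>

/* Sketch of a direct microkernel call. Note that the first argument is a
 * byte count, hence the multiplication by sizeof(__fp16). */
void sigmoid_f16(const __fp16* input, __fp16* output, size_t elements) {
  union xnn_f16_sigmoid_params params;
  xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params(&params);
  xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16(
      elements * sizeof(__fp16), input, output, &params);
}
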
diff --git a/test/f16-vsigmoid.cc b/test/f16-vsigmoid.cc
new file mode 100644
index 0000000..04f8798
--- /dev/null
+++ b/test/f16-vsigmoid.cc
@@ -0,0 +1,769 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: test/f16-vsigmoid.yaml
+// Generator: tools/generate-vunary-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/vunary.h>
+#include "vunary-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X8, batch_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X8, batch_div_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X8, batch_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X8, batch_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X8, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X16, batch_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X16, batch_div_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X16, batch_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X16, batch_gt_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X16, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X24, batch_eq_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(24)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X24, batch_div_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X24, batch_lt_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X24, batch_gt_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X24, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X32, batch_eq_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X32, batch_div_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X32, batch_lt_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X32, batch_gt_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X32, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X40, batch_eq_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(40)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X40, batch_div_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 80; batch_size < 400; batch_size += 40) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X40, batch_lt_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 40; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X40, batch_gt_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 41; batch_size < 80; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X40, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 200; batch_size += 39) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X48, batch_eq_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(48)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X48, batch_div_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 96; batch_size < 480; batch_size += 48) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X48, batch_lt_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 48; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X48, batch_gt_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 49; batch_size < 96; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X48, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X56, batch_eq_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(56)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X56, batch_div_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 112; batch_size < 560; batch_size += 56) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X56, batch_lt_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 56; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X56, batch_gt_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 57; batch_size < 112; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X56, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 280; batch_size += 55) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X64, batch_eq_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(64)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X64, batch_div_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 128; batch_size < 640; batch_size += 64) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X64, batch_lt_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 64; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X64, batch_gt_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 65; batch_size < 128; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_DIV_X64, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X8, batch_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X8, batch_div_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X8, batch_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X8, batch_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X8, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X16, batch_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X16, batch_div_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X16, batch_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X16, batch_gt_16) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X16, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X24, batch_eq_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(24)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X24, batch_div_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X24, batch_lt_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X24, batch_gt_24) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X24, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X32, batch_eq_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X32, batch_div_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X32, batch_lt_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X32, batch_gt_32) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X32, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X40, batch_eq_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(40)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X40, batch_div_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 80; batch_size < 400; batch_size += 40) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X40, batch_lt_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 40; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X40, batch_gt_40) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 41; batch_size < 80; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X40, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 200; batch_size += 39) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X48, batch_eq_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(48)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X48, batch_div_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 96; batch_size < 480; batch_size += 48) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X48, batch_lt_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 48; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X48, batch_gt_48) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 49; batch_size < 96; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X48, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 240; batch_size += 47) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X56, batch_eq_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(56)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X56, batch_div_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 112; batch_size < 560; batch_size += 56) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X56, batch_lt_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 56; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X56, batch_gt_56) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 57; batch_size < 112; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X56, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 280; batch_size += 55) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X64, batch_eq_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ VUnaryMicrokernelTester()
+ .batch_size(64)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X64, batch_div_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 128; batch_size < 640; batch_size += 64) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X64, batch_lt_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size < 64; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X64, batch_gt_64) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 65; batch_size < 128; batch_size++) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+
+ TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X64, inplace) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t batch_size = 1; batch_size <= 320; batch_size += 63) {
+ VUnaryMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64, xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
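
The div and recpe suffixes in the tests above name two tails of the same rr1-p3 kernel: div finishes sigmoid(x) = e^x / (1 + e^x) with a full vdivq_f16, while recpe substitutes the hardware reciprocal estimate refined by Newton-Raphson. A minimal sketch of the recpe idea, assuming NEON FP16 arithmetic; sigmoid_from_exp is an illustrative helper, not taken from this diff, and the shipped kernels may order or count the refinement steps differently:

    #include <arm_neon.h>

    // Given ve = e^x, finish sigmoid(x) = e^x / (1 + e^x) without a division.
    // vrecpeq_f16 yields a coarse reciprocal estimate; vrecpsq_f16(r, d)
    // returns (2 - r*d), so r * (2 - r*d) is one Newton-Raphson step, which
    // is already close to the 11-bit significand of half precision.
    static inline float16x8_t sigmoid_from_exp(float16x8_t ve) {
      const float16x8_t vd = vaddq_f16(ve, vmovq_n_f16(1.0f));  // d = 1 + e^x
      float16x8_t vr = vrecpeq_f16(vd);                         // r ~= 1/d
      vr = vmulq_f16(vr, vrecpsq_f16(vr, vd));                  // refine r
      return vmulq_f16(ve, vr);                                 // e^x * (1/d)
    }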
diff --git a/test/f16-vsigmoid.yaml b/test/f16-vsigmoid.yaml
new file mode 100644
index 0000000..88e23d4
--- /dev/null
+++ b/test/f16-vsigmoid.yaml
@@ -0,0 +1,70 @@
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# ARM NEON
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x8
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x16
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x24
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x32
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x40
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x48
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x56
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_div_x64
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x16
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x24
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x32
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x40
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x48
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x56
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
+- name: xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x64
+ init: xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params
+ arch:
+ - aarch64
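
Each spec entry above pairs a microkernel with the params initializer the tester must call, and pins it to aarch64, matching the #if XNN_ARCH_ARM64 guards in the generated source. The f16-vsigmoid.cc tests earlier in this diff are produced from this spec; in XNNPACK the generator invocation is typically along the lines of tools/generate-vunary-test.py --spec test/f16-vsigmoid.yaml --output test/f16-vsigmoid.cc (exact flags may vary).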
diff --git a/test/vunary-microkernel-tester.h b/test/vunary-microkernel-tester.h
index eb9b339..e64719a 100644
--- a/test/vunary-microkernel-tester.h
+++ b/test/vunary-microkernel-tester.h
@@ -486,6 +486,49 @@
}
}
+ void Test(xnn_f16_vsigmoid_ukernel_function vsigmoid, xnn_init_f16_sigmoid_params_fn init_params) const {
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto distribution = std::uniform_real_distribution<float>(-25.0f, 25.0f);
+ auto f32rng = std::bind(distribution, std::ref(rng));
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<uint16_t> x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint16_t));
+ std::vector<uint16_t> y(batch_size() + (inplace() ? XNN_EXTRA_BYTES / sizeof(uint16_t) : 0));
+ std::vector<float> y_ref(batch_size());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ if (inplace()) {
+ std::generate(y.begin(), y.end(), std::ref(f16rng));
+ } else {
+ std::generate(x.begin(), x.end(), std::ref(f16rng));
+ std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
+ }
+ const uint16_t* x_data = inplace() ? y.data() : x.data();
+
+ // Compute reference results.
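+ // sigma(x) = e^x / (1 + e^x), evaluated in fp32 on the (possibly in-place) input.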
+ for (size_t i = 0; i < batch_size(); i++) {
+ const float e = std::exp(fp16_ieee_to_fp32_value(x_data[i]));
+ y_ref[i] = e / (1.0f + e);
+ }
+
+ // Prepare parameters.
+ union xnn_f16_sigmoid_params params;
+ init_params(&params);
+
+ // Call optimized micro-kernel.
+ vsigmoid(batch_size() * sizeof(uint16_t), x_data, y.data(), &params);
+
+ // Verify results.
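+ // FP16 rounding alone costs ~2^-11 (~0.05%) relative error; the 0.5% relative / 1e-4 absolute bound below leaves headroom for the polynomial approximation.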
+ for (size_t i = 0; i < batch_size(); i++) {
+ ASSERT_NEAR(
+ fp16_ieee_to_fp32_value(y[i]),
+ y_ref[i],
+ std::max(1.0e-4f, std::abs(y_ref[i]) * 5.0e-3f))
+ << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << fp16_ieee_to_fp32_value(x[i]);
+ }
+ }
+ }
+
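
Beyond the generated batch sweeps, a one-off case in the same pattern is sometimes handy when bisecting a failure; a hypothetical example (not part of this diff) that forces the x8 kernel's remainder path through the overload above:

    TEST(F16_VSIGMOID__NEONFP16ARITH_RR1_P3_RECPE_X8, remainder_17) {
      TEST_REQUIRES_ARM_NEON_FP16_ARITH;
      VUnaryMicrokernelTester()
        .batch_size(17)  // not a multiple of 8, so the remainder path runs
        .inplace(true)
        .Test(xnn_f16_vsigmoid_ukernel__neonfp16arith_rr1_p3_recpe_x8,
              xnn_init_f16_sigmoid_neonfp16arith_rr1_p3_params);
    }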
void Test(xnn_f32_vsigmoid_ukernel_function vsigmoid, xnn_init_f32_sigmoid_params_fn init_params) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());