pl/math: Remove non-FEXPA variants of SVE expf and exp2f
Following the previous patch, more relaxed NaN handling in FEXPA means
that the FEXPA variants are always faster and more accurate. The
Neon-style algorithms have been completely removed.
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 5c102d7..d110a2a 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -564,13 +564,6 @@
double logc[1 << V_LOG_TABLE_BITS];
} __v_log_data HIDDEN;
-#ifndef SV_EXPF_USE_FEXPA
-# define SV_EXPF_USE_FEXPA 0
-#endif
-
-#ifndef SV_EXP2F_USE_FEXPA
-# define SV_EXP2F_USE_FEXPA 0
-#endif
#define SV_EXP2F_POLY_ORDER 6
extern const float __sv_exp2f_poly[SV_EXP2F_POLY_ORDER - 1] HIDDEN;
diff --git a/pl/math/sv_exp2f_2u.c b/pl/math/sv_exp2f_2u.c
index 666d651..5946959 100644
--- a/pl/math/sv_exp2f_2u.c
+++ b/pl/math/sv_exp2f_2u.c
@@ -8,12 +8,9 @@
#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"
-#include "sv_expf_specialcase.h"
#define C(i) __sv_exp2f_poly[i]
-#if SV_EXP2F_USE_FEXPA
-
#define Shift sv_f32 (0x1.903f8p17f) /* 1.5*2^17 + 127. */
#define Thres \
(0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \
@@ -22,28 +19,14 @@
static NOINLINE svfloat32_t
special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
{
- /* The special-case handler from the Neon routine does not handle subnormals
- in a way that is compatible with FEXPA. For the FEXPA variant we just fall
- back to scalar exp2f. */
return sv_call_f32 (exp2f, x, y, special);
}
-
-#else
-
-#define Shift sv_f32 (0x1.8p23f) /* 1.5 * 2^23. */
-#define Thres (126.0f)
-
-#endif
/* Single-precision SVE exp2f routine. Implements the Neon algorithm
for exp2f from math/.
- Worst case error with FEXPA enabled is 1.04 ULPs.
+ Worst case error is 1.04 ULPs.
SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
- want 0x1.ba7ebp+0
-
- Worst case error without FEXPA is 1.96 ULPs.
- SV_NAME_F1 (exp2)(0x1.ff7338p-2) got 0x1.69e764p+0
- want 0x1.69e768p+0. */
+ want 0x1.ba7ebp+0. */
svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
{
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
@@ -52,15 +35,8 @@
svfloat32_t n = svsub_f32_x (pg, z, Shift);
svfloat32_t r = svsub_f32_x (pg, x, n);
-#if SV_EXP2F_USE_FEXPA
svbool_t is_special_case = svacgt_n_f32 (pg, x, Thres);
svfloat32_t scale = svexpa_f32 (svreinterpret_u32_f32 (z));
-#else
- svuint32_t e
- = svlsl_n_u32_x (pg, svreinterpret_u32_s32 (svcvt_s32_f32_x (pg, n)), 23);
- svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres);
- svfloat32_t scale = svreinterpret_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000));
-#endif
/* Polynomial evaluation: poly(r) ~ exp(r)-1. */
svfloat32_t r2 = svmul_f32_x (pg, r, r);
@@ -71,21 +47,14 @@
svfloat32_t poly = svmla_f32_x (pg, p, r2, q);
if (unlikely (svptest_any (pg, is_special_case)))
- {
-#if SV_EXP2F_USE_FEXPA
return special_case (x, svmla_f32_x (pg, scale, scale, poly),
is_special_case);
-#else
- return __sv_expf_specialcase (pg, poly, n, e, is_special_case, scale);
-
-#endif
- }
return svmla_f32_x (pg, scale, scale, poly);
}
PL_SIG (SV, F, 1, exp2, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (exp2), 1.47)
+PL_TEST_ULP (SV_NAME_F1 (exp2), 0.55)
PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 40000)
PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, 1, 50000)
PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 1, Thres, 50000)
diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c
index 0f4f95f..3d85d24 100644
--- a/pl/math/sv_expf_2u.c
+++ b/pl/math/sv_expf_2u.c
@@ -9,7 +9,6 @@
#include "sv_estrinf.h"
#include "pl_sig.h"
#include "pl_test.h"
-#include "sv_expf_specialcase.h"
static struct
{
@@ -23,45 +22,24 @@
.inv_ln2 = 0x1.715476p+0f,
.ln2_hi = 0x1.62e4p-1f,
.ln2_lo = 0x1.7f7d1cp-20f,
-#if SV_EXPF_USE_FEXPA
/* 1.5*2^17 + 127. */
.shift = 0x1.903f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
correctly by FEXPA. */
.thres = 0x1.5d5e2ap+6f,
-#else
- /* 1.5 * 2^23. */
- .shift = 0x1.8p23f,
- .thres = 126.0f,
-#endif
};
#define C(i) sv_f32 (data.poly[i])
#define ExponentBias 0x3f800000
-#if SV_EXPF_USE_FEXPA
-
static svfloat32_t NOINLINE
special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
{
- /* The special-case handler from the Neon routine does not handle subnormals
- in a way that is compatible with FEXPA. For the FEXPA variant we just fall
- back to scalar expf. */
return sv_call_f32 (expf, x, y, special);
}
-#endif
-
-/* Optimised single-precision SVE exp function. By default this is an SVE port
- of the Neon algorithm from math/. Alternatively, enable a modification of
- that algorithm that looks up scale using SVE FEXPA instruction with
- SV_EXPF_USE_FEXPA.
-
- Worst-case error of the default algorithm is 1.95 ulp:
- SV_NAME_F1 (exp)(-0x1.4cb74ap+2) got 0x1.6a022cp-8
- want 0x1.6a023p-8.
-
- Worst-case error when using FEXPA is 1.04 ulp:
+/* Optimised single-precision SVE exp function.
+ Worst-case error is 1.04 ulp:
SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
want 0x1.ba74bap+4. */
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
@@ -82,15 +60,8 @@
r = svmls_lane_f32 (r, n, invln2_and_ln2, 2);
/* scale = 2^(n/N). */
-#if SV_EXPF_USE_FEXPA
svbool_t is_special_case = svacgt_n_f32 (pg, x, data.thres);
svfloat32_t scale = svexpa_f32 (svreinterpret_u32_f32 (z));
-#else
- svuint32_t e = svlsl_n_u32_x (pg, svreinterpret_u32_f32 (z), 23);
- svbool_t is_special_case = svacgt_n_f32 (pg, n, data.thres);
- svfloat32_t scale
- = svreinterpret_f32_u32 (svadd_n_u32_x (pg, e, ExponentBias));
-#endif
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
svfloat32_t r2 = svmul_f32_x (pg, r, r);
@@ -101,18 +72,14 @@
svfloat32_t poly = svmla_f32_x (pg, p0, r2, p14);
if (unlikely (svptest_any (pg, is_special_case)))
-#if SV_EXPF_USE_FEXPA
return special_case (x, svmla_f32_x (pg, scale, scale, poly),
is_special_case);
-#else
- return __sv_expf_specialcase (pg, poly, n, e, is_special_case, scale);
-#endif
return svmla_f32_x (pg, scale, scale, poly);
}
PL_SIG (SV, F, 1, exp, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (exp), 1.46)
+PL_TEST_ULP (SV_NAME_F1 (exp), 0.55)
PL_TEST_INTERVAL (SV_NAME_F1 (exp), 0, 0x1p-23, 40000)
PL_TEST_INTERVAL (SV_NAME_F1 (exp), 0x1p-23, 1, 50000)
PL_TEST_INTERVAL (SV_NAME_F1 (exp), 1, 0x1p23, 50000)
diff --git a/pl/math/sv_expf_specialcase.h b/pl/math/sv_expf_specialcase.h
deleted file mode 100644
index 83b09fc..0000000
--- a/pl/math/sv_expf_specialcase.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Single-precision SVE special cas function for expf routines
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "math_config.h"
-#include "sv_math.h"
-
-#define ScaleThres 192.0f
-#define SpecialOffset 0x82000000
-#define SpecialBias 0x7f000000
-
-/* Special-case handler adapted from Neon variant for expf/exp2f.
- Uses s, y and n to produce the final result (normal cases included).
- It performs an update of all lanes!
- Therefore:
- - all previous computation need to be done on all lanes indicated by input
- pg
- - we cannot simply apply the special case to the special-case-activated
- lanes. */
-static inline svfloat32_t
-__sv_expf_specialcase (svbool_t pg, svfloat32_t poly, svfloat32_t n,
- svuint32_t e, svbool_t p_cmp1, svfloat32_t scale)
-{
- /* s=2^(n/N) may overflow, break it up into s=s1*s2,
- such that exp = s + s*y can be computed as s1*(s2+s2*y)
- and s1*s1 overflows only if n>0. */
-
- /* If n<=0 then set b to 0x820...0, 0 otherwise. */
- svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */
- svuint32_t b
- = svdup_n_u32_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
-
- /* Set s1 to generate overflow depending on sign of exponent n. */
- svfloat32_t s1 = svreinterpret_f32_u32 (
- svadd_n_u32_x (pg, b, SpecialBias)); /* b + 0x7f000000. */
- /* Offset s to avoid overflow in final result if n is below threshold. */
- svfloat32_t s2 = svreinterpret_f32_u32 (svsub_u32_x (pg, e, b));
-
- /* |n| > 192 => 2^(n/N) overflows. */
- svbool_t p_cmp2 = svacgt_n_f32 (pg, n, ScaleThres);
-
- svfloat32_t r2 = svmul_f32_x (pg, s1, s1);
- svfloat32_t r1 = svmla_f32_x (pg, s2, s2, poly);
- r1 = svmul_f32_x (pg, r1, s1);
- svfloat32_t r0 = svmla_f32_x (pg, scale, scale, poly);
-
- /* Apply condition 1 then 2.
- Returns r2 if cond2 is true, otherwise
- if cond1 is true then return r1, otherwise return r0. */
- svfloat32_t r = svsel_f32 (p_cmp1, r1, r0);
-
- return svsel_f32 (p_cmp2, r2, r);
-}