pl/math: Remove non-FEXPA variants of SVE expf and exp2f Following the previous patch, more relaxed NaN handling in FEXPA means that the FEXPA variants are always faster and more accurate. The Neon-style algorithms have been completely removed.

commit: fe02a82840e2d24cb5e66722554184e0b3305e19 [log] [tgz]
author: Joe Ramsay <Joe.Ramsay@arm.com> Tue Jun 06 14:44:20 2023 +0100
committer: Joe Ramsay <joe.ramsay@arm.com> Tue Jun 06 14:44:20 2023 +0100
tree: 9627f15ae3046fad761948e7507fecf933fdcda0
parent: 6fd1120e3e5344334d4aac07dfb200895b7308a5 [diff]
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index 5c102d7..d110a2a 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h

@@ -564,13 +564,6 @@
   double logc[1 << V_LOG_TABLE_BITS];
 } __v_log_data HIDDEN;
 
-#ifndef SV_EXPF_USE_FEXPA
-# define SV_EXPF_USE_FEXPA 0
-#endif
-
-#ifndef SV_EXP2F_USE_FEXPA
-# define SV_EXP2F_USE_FEXPA 0
-#endif
 #define SV_EXP2F_POLY_ORDER 6
 extern const float __sv_exp2f_poly[SV_EXP2F_POLY_ORDER - 1] HIDDEN;
 

diff --git a/pl/math/sv_exp2f_2u.c b/pl/math/sv_exp2f_2u.c
index 666d651..5946959 100644
--- a/pl/math/sv_exp2f_2u.c
+++ b/pl/math/sv_exp2f_2u.c

@@ -8,12 +8,9 @@
 #include "sv_math.h"
 #include "pl_sig.h"
 #include "pl_test.h"
-#include "sv_expf_specialcase.h"
 
 #define C(i) __sv_exp2f_poly[i]
 
-#if SV_EXP2F_USE_FEXPA
-
 #define Shift sv_f32 (0x1.903f8p17f) /* 1.5*2^17 + 127.  */
 #define Thres                                                                  \
   (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal    \
@@ -22,28 +19,14 @@
 static NOINLINE svfloat32_t
 special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
 {
-  /* The special-case handler from the Neon routine does not handle subnormals
-     in a way that is compatible with FEXPA. For the FEXPA variant we just fall
-     back to scalar exp2f.  */
   return sv_call_f32 (exp2f, x, y, special);
 }
-
-#else
-
-#define Shift sv_f32 (0x1.8p23f) /* 1.5 * 2^23.  */
-#define Thres (126.0f)
-
-#endif
 /* Single-precision SVE exp2f routine. Implements the Neon algorithm
    for exp2f from math/.
 
-   Worst case error with FEXPA enabled is 1.04 ULPs.
+   Worst case error is 1.04 ULPs.
    SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
-	  want 0x1.ba7ebp+0
-
-   Worst case error without FEXPA is 1.96 ULPs.
-   SV_NAME_F1 (exp2)(0x1.ff7338p-2) got 0x1.69e764p+0
-			    want 0x1.69e768p+0.  */
+				  want 0x1.ba7ebp+0.  */
 svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
 {
   /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
@@ -52,15 +35,8 @@
   svfloat32_t n = svsub_f32_x (pg, z, Shift);
   svfloat32_t r = svsub_f32_x (pg, x, n);
 
-#if SV_EXP2F_USE_FEXPA
   svbool_t is_special_case = svacgt_n_f32 (pg, x, Thres);
   svfloat32_t scale = svexpa_f32 (svreinterpret_u32_f32 (z));
-#else
-  svuint32_t e
-    = svlsl_n_u32_x (pg, svreinterpret_u32_s32 (svcvt_s32_f32_x (pg, n)), 23);
-  svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres);
-  svfloat32_t scale = svreinterpret_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000));
-#endif
 
   /* Polynomial evaluation: poly(r) ~ exp(r)-1.  */
   svfloat32_t r2 = svmul_f32_x (pg, r, r);
@@ -71,21 +47,14 @@
   svfloat32_t poly = svmla_f32_x (pg, p, r2, q);
 
   if (unlikely (svptest_any (pg, is_special_case)))
-    {
-#if SV_EXP2F_USE_FEXPA
       return special_case (x, svmla_f32_x (pg, scale, scale, poly),
 			   is_special_case);
 
-#else
-      return __sv_expf_specialcase (pg, poly, n, e, is_special_case, scale);
-
-#endif
-    }
   return svmla_f32_x (pg, scale, scale, poly);
 }
 
 PL_SIG (SV, F, 1, exp2, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (exp2), 1.47)
+PL_TEST_ULP (SV_NAME_F1 (exp2), 0.55)
 PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 40000)
 PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, 1, 50000)
 PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 1, Thres, 50000)

diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c
index 0f4f95f..3d85d24 100644
--- a/pl/math/sv_expf_2u.c
+++ b/pl/math/sv_expf_2u.c

@@ -9,7 +9,6 @@
 #include "sv_estrinf.h"
 #include "pl_sig.h"
 #include "pl_test.h"
-#include "sv_expf_specialcase.h"
 
 static struct
 {
@@ -23,45 +22,24 @@
   .inv_ln2 = 0x1.715476p+0f,
   .ln2_hi = 0x1.62e4p-1f,
   .ln2_lo = 0x1.7f7d1cp-20f,
-#if SV_EXPF_USE_FEXPA
   /* 1.5*2^17 + 127.  */
   .shift = 0x1.903f8p17f,
   /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
      correctly by FEXPA.  */
   .thres = 0x1.5d5e2ap+6f,
-#else
-  /* 1.5 * 2^23.  */
-  .shift = 0x1.8p23f,
-  .thres = 126.0f,
-#endif
 };
 
 #define C(i) sv_f32 (data.poly[i])
 #define ExponentBias 0x3f800000
 
-#if SV_EXPF_USE_FEXPA
-
 static svfloat32_t NOINLINE
 special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
 {
-  /* The special-case handler from the Neon routine does not handle subnormals
-     in a way that is compatible with FEXPA. For the FEXPA variant we just fall
-     back to scalar expf.  */
   return sv_call_f32 (expf, x, y, special);
 }
 
-#endif
-
-/* Optimised single-precision SVE exp function. By default this is an SVE port
-   of the Neon algorithm from math/. Alternatively, enable a modification of
-   that algorithm that looks up scale using SVE FEXPA instruction with
-   SV_EXPF_USE_FEXPA.
-
-   Worst-case error of the default algorithm is 1.95 ulp:
-   SV_NAME_F1 (exp)(-0x1.4cb74ap+2) got 0x1.6a022cp-8
-				   want 0x1.6a023p-8.
-
-   Worst-case error when using FEXPA is 1.04 ulp:
+/* Optimised single-precision SVE exp function.
+   Worst-case error is 1.04 ulp:
    SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
 				  want 0x1.ba74bap+4.  */
 svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
@@ -82,15 +60,8 @@
   r = svmls_lane_f32 (r, n, invln2_and_ln2, 2);
 
 /* scale = 2^(n/N).  */
-#if SV_EXPF_USE_FEXPA
   svbool_t is_special_case = svacgt_n_f32 (pg, x, data.thres);
   svfloat32_t scale = svexpa_f32 (svreinterpret_u32_f32 (z));
-#else
-  svuint32_t e = svlsl_n_u32_x (pg, svreinterpret_u32_f32 (z), 23);
-  svbool_t is_special_case = svacgt_n_f32 (pg, n, data.thres);
-  svfloat32_t scale
-      = svreinterpret_f32_u32 (svadd_n_u32_x (pg, e, ExponentBias));
-#endif
 
   /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
   svfloat32_t r2 = svmul_f32_x (pg, r, r);
@@ -101,18 +72,14 @@
   svfloat32_t poly = svmla_f32_x (pg, p0, r2, p14);
 
   if (unlikely (svptest_any (pg, is_special_case)))
-#if SV_EXPF_USE_FEXPA
     return special_case (x, svmla_f32_x (pg, scale, scale, poly),
 			 is_special_case);
-#else
-    return __sv_expf_specialcase (pg, poly, n, e, is_special_case, scale);
-#endif
 
   return svmla_f32_x (pg, scale, scale, poly);
 }
 
 PL_SIG (SV, F, 1, exp, -9.9, 9.9)
-PL_TEST_ULP (SV_NAME_F1 (exp), 1.46)
+PL_TEST_ULP (SV_NAME_F1 (exp), 0.55)
 PL_TEST_INTERVAL (SV_NAME_F1 (exp), 0, 0x1p-23, 40000)
 PL_TEST_INTERVAL (SV_NAME_F1 (exp), 0x1p-23, 1, 50000)
 PL_TEST_INTERVAL (SV_NAME_F1 (exp), 1, 0x1p23, 50000)

diff --git a/pl/math/sv_expf_specialcase.h b/pl/math/sv_expf_specialcase.h
deleted file mode 100644
index 83b09fc..0000000
--- a/pl/math/sv_expf_specialcase.h
+++ /dev/null

@@ -1,55 +0,0 @@
-/*
- * Single-precision SVE special cas function for expf routines
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "math_config.h"
-#include "sv_math.h"
-
-#define ScaleThres 192.0f
-#define SpecialOffset 0x82000000
-#define SpecialBias 0x7f000000
-
-/* Special-case handler adapted from Neon variant for expf/exp2f.
-   Uses s, y and n to produce the final result (normal cases included).
-   It performs an update of all lanes!
-   Therefore:
-   - all previous computation need to be done on all lanes indicated by input
-     pg
-   - we cannot simply apply the special case to the special-case-activated
-     lanes. */
-static inline svfloat32_t
-__sv_expf_specialcase (svbool_t pg, svfloat32_t poly, svfloat32_t n,
-		       svuint32_t e, svbool_t p_cmp1, svfloat32_t scale)
-{
-  /* s=2^(n/N) may overflow, break it up into s=s1*s2,
-     such that exp = s + s*y can be computed as s1*(s2+s2*y)
-     and s1*s1 overflows only if n>0.  */
-
-  /* If n<=0 then set b to 0x820...0, 0 otherwise.  */
-  svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0.  */
-  svuint32_t b
-    = svdup_n_u32_z (p_sign, SpecialOffset); /* Inactive lanes set to 0.  */
-
-  /* Set s1 to generate overflow depending on sign of exponent n.  */
-  svfloat32_t s1 = svreinterpret_f32_u32 (
-    svadd_n_u32_x (pg, b, SpecialBias)); /* b + 0x7f000000.  */
-  /* Offset s to avoid overflow in final result if n is below threshold.  */
-  svfloat32_t s2 = svreinterpret_f32_u32 (svsub_u32_x (pg, e, b));
-
-  /* |n| > 192 => 2^(n/N) overflows.  */
-  svbool_t p_cmp2 = svacgt_n_f32 (pg, n, ScaleThres);
-
-  svfloat32_t r2 = svmul_f32_x (pg, s1, s1);
-  svfloat32_t r1 = svmla_f32_x (pg, s2, s2, poly);
-  r1 = svmul_f32_x (pg, r1, s1);
-  svfloat32_t r0 = svmla_f32_x (pg, scale, scale, poly);
-
-  /* Apply condition 1 then 2.
-     Returns r2 if cond2 is true, otherwise
-     if cond1 is true then return r1, otherwise return r0.  */
-  svfloat32_t r = svsel_f32 (p_cmp1, r1, r0);
-
-  return svsel_f32 (p_cmp2, r2, r);
-}
commit	fe02a82840e2d24cb5e66722554184e0b3305e19	[log] [tgz]
author	Joe Ramsay <Joe.Ramsay@arm.com>	Tue Jun 06 14:44:20 2023 +0100
committer	Joe Ramsay <joe.ramsay@arm.com>	Tue Jun 06 14:44:20 2023 +0100
tree	9627f15ae3046fad761948e7507fecf933fdcda0
parent	6fd1120e3e5344334d4aac07dfb200895b7308a5 [diff]