pl/math: Set fenv flags in Neon asinhf

Routine no longer relies on vector log1pf, as this has to become more
complex to deal with fenv itself. Instead we re-use a log1pf helper
from Neon atanhf which does no special-case handling, instead leaving
it all up to the main routine. We now just fall back to the scalar
routine for special-case handling. This uncovered a mistake in
asinhf's handling of NaNs, which has been fixed.
diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c
index 10f9f31..8aa62ad 100644
--- a/pl/math/asinhf_3u5.c
+++ b/pl/math/asinhf_3u5.c
@@ -11,7 +11,6 @@
 #define Ln2 (0x1.62e4p-1f)
 #define One (0x3f8)
 #define ExpM12 (0x398)
-#define QNaN (0x7fc)
 
 #define C(i) __asinhf_data.coeffs[i]
 
@@ -45,10 +44,11 @@
   float ax = asfloat (ia);
   uint32_t sign = ix & ~AbsMask;
 
-  if (ia12 < ExpM12 || ia12 == QNaN)
-    {
-      return x;
-    }
+  if (unlikely (ia12 < ExpM12 || ia == 0x7f800000))
+    return x;
+
+  if (unlikely (ia12 >= 0x7f8))
+    return __math_invalidf (x);
 
   if (ia12 < One)
     {
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 484ebdf..ed45c73 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -766,10 +766,10 @@
 log1pf __v_log1pf      $runv
 log1pf __vn_log1pf     $runvn
 log1pf _ZGVnN4v_log1pf $runvn
-asinhf __s_asinhf      $runs
-asinhf __v_asinhf      $runv
-asinhf __vn_asinhf     $runvn
-asinhf _ZGVnN4v_asinhf $runvn
+asinhf __s_asinhf      $runs    fenv
+asinhf __v_asinhf      $runv    fenv
+asinhf __vn_asinhf     $runvn   fenv
+asinhf _ZGVnN4v_asinhf $runvn   fenv
 log2f  __s_log2f       $runs
 log2f  __v_log2f       $runv
 log2f  __vn_log2f      $runvn
diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c
index 675b8a8..7bce7ff 100644
--- a/pl/math/v_asinhf_2u7.c
+++ b/pl/math/v_asinhf_2u7.c
@@ -11,34 +11,45 @@
 
 #define SignMask v_u32 (0x80000000)
 #define One v_f32 (1.0f)
-#define Ln2 v_f32 (0x1.62e43p-1f)
-#define SpecialBound v_u32 (0x5f800000) /* asuint(0x1p64).  */
+#define BigBound v_u32 (0x5f800000)  /* asuint(0x1p64).  */
+#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30).  */
+
+#include "v_log1pf_inline.h"
+
+static NOINLINE v_f32_t
+specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+{
+  return v_call_f32 (asinhf, x, y, special);
+}
 
 /* Single-precision implementation of vector asinh(x), using vector log1p.
    Worst-case error is 2.66 ULP, at roughly +/-0.25:
    __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3.  */
 VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x)
 {
-  v_f32_t ax = v_abs_f32 (x);
-  v_u32_t special = v_cond_u32 (v_as_u32_f32 (ax) >= SpecialBound);
-  v_u32_t sign = v_as_u32_f32 (x) & SignMask;
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t iax = ix & ~SignMask;
+  v_u32_t sign = ix & SignMask;
+  v_f32_t ax = v_as_f32_u32 (iax);
+  v_u32_t special = v_cond_u32 (iax >= BigBound);
+
+#if WANT_ERRNO
+  /* Sidestep tiny and large values to avoid inadvertently triggering
+     under/overflow.  */
+  special |= v_cond_u32 (iax < TinyBound);
+  if (unlikely (v_any_u32 (special)))
+    ax = v_sel_f32 (special, One, ax);
+#endif
 
   /* asinh(x) = log(x + sqrt(x * x + 1)).
      For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))).  */
   v_f32_t d = One + v_sqrt_f32 (ax * ax + One);
-  v_f32_t y = V_NAME (log1pf) (ax + ax * ax / d);
+  v_f32_t y = log1pf_inline (ax + ax * ax / d);
+  y = v_as_f32_u32 (sign | v_as_u32_f32 (y));
 
   if (unlikely (v_any_u32 (special)))
-    {
-      /* If |x| is too large, we cannot square it at low cost without overflow.
-	 At very large x, asinh(x) ~= log(2x) and log(x) ~= log1p(x), so we
-	 calculate asinh(x) as log1p(x) + log(2).  */
-      v_f32_t y_large = V_NAME (log1pf) (ax) + Ln2;
-      return v_as_f32_u32 (sign
-			   | v_as_u32_f32 (v_sel_f32 (special, y_large, y)));
-    }
-
-  return v_as_f32_u32 (sign | v_as_u32_f32 (y));
+    return specialcase (x, y, special);
+  return y;
 }
 VPCS_ALIAS
 
diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c
index 54dcb9b..1e3a561 100644
--- a/pl/math/v_atanhf_3u1.c
+++ b/pl/math/v_atanhf_3u1.c
@@ -9,50 +9,13 @@
 
 #if V_SUPPORTED
 
+#include "v_log1pf_inline.h"
+
 #define AbsMask 0x7fffffff
 #define Half 0x3f000000
 #define One 0x3f800000
-#define Four 0x40800000
-#define Ln2 0x1.62e43p-1f
 #define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
 
-#define C(i) v_f32 (__log1pf_data.coeffs[i])
-
-static inline v_f32_t
-eval_poly (v_f32_t m)
-{
-  /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme.  */
-  v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
-  v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
-  v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
-  v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
-
-  v_f32_t m2 = m * m;
-  v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
-  v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
-  v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
-
-  v_f32_t m4 = m2 * m2;
-  v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
-
-  return v_fma_f32 (m4, m4 * p_79, p_06);
-}
-
-static inline v_f32_t
-log1pf_inline (v_f32_t x)
-{
-  /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
-     special-case handling. See that file for details of the algorithm.  */
-  v_f32_t m = x + 1.0f;
-  v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
-  v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
-  v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
-		    + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
-  v_f32_t p = eval_poly (m_scale);
-  v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
-  return v_fma_f32 (scale_back, v_f32 (Ln2), p);
-}
-
 /* Approximation for vector single-precision atanh(x) using modified log1p.
    The maximum error is 3.08 ULP:
    __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h
new file mode 100644
index 0000000..cf32b2a
--- /dev/null
+++ b/pl/math/v_log1pf_inline.h
@@ -0,0 +1,55 @@
+/*
+ * Helper for single-precision routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_V_LOG1PF_INLINE_H
+#define PL_MATH_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "math_config.h"
+
+#define Four 0x40800000
+#define Ln2 v_f32 (0x1.62e43p-1f)
+
+#define C(i) v_f32 (__log1pf_data.coeffs[i])
+
+static inline v_f32_t
+eval_poly (v_f32_t m)
+{
+  /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme.  */
+  v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
+  v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
+  v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
+  v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+
+  v_f32_t m2 = m * m;
+  v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
+  v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
+  v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+
+  v_f32_t m4 = m2 * m2;
+  v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
+
+  return v_fma_f32 (m4, m4 * p_79, p_06);
+}
+
+static inline v_f32_t
+log1pf_inline (v_f32_t x)
+{
+  /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+     special-case handling. See that file for details of the algorithm.  */
+  v_f32_t m = x + 1.0f;
+  v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
+  v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
+  v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
+		    + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+  v_f32_t p = eval_poly (m_scale);
+  v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
+  return v_fma_f32 (scale_back, Ln2, p);
+}
+
+#endif //  PL_MATH_V_LOG1PF_INLINE_H