pl/math: Add vector/Neon atanf

Successfully ran tests and benchmarks. New routine is accurate to 3 ulps.
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index baee70f..0b7a745 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -17,6 +17,7 @@
 double atan2 (double, double);
 double log10 (double);
 
+float __s_atanf (float);
 float __s_atan2f (float, float);
 float __s_erfcf (float);
 float __s_erff (float);
@@ -40,6 +41,7 @@
 #endif
 
 /* Vector functions following the base PCS.  */
+__f32x4_t __v_atanf (__f32x4_t);
 __f64x2_t __v_atan (__f64x2_t);
 __f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
 __f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
@@ -54,6 +56,7 @@
 #define __vpcs __attribute__((__aarch64_vector_pcs__))
 
 /* Vector functions following the vector PCS.  */
+__vpcs __f32x4_t __vn_atanf (__f32x4_t);
 __vpcs __f64x2_t __vn_atan (__f64x2_t);
 __vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
@@ -65,6 +68,7 @@
 __vpcs __f64x2_t __vn_log10 (__f64x2_t);
 
 /* Vector functions following the vector PCS using ABI names.  */
+__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
 __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
 __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
 __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
diff --git a/pl/math/s_atanf_3u.c b/pl/math/s_atanf_3u.c
new file mode 100644
index 0000000..4e8a2f7
--- /dev/null
+++ b/pl/math/s_atanf_3u.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define SCALAR 1
+#include "v_atanf_3u.c"
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index f1455aa..0f8e0ca 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -5,6 +5,7 @@
  * Copyright (c) 2022, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
+F (atanf, -10.0, 10.0)
 {"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}},
 F (erfcf, -4.0, 10.0)
 F (erff, -4.0, 4.0)
@@ -17,6 +18,7 @@
 D (log10, 0.01, 11.1)
 
 #if WANT_VMATH
+F (__s_atanf, -10.0, 10.0)
 D (__s_atan, -10.0, 10.0)
 {"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}},
 {"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}},
@@ -27,6 +29,7 @@
 F (__s_log10f, 0.01, 11.1)
 D (__s_log10, 0.01, 11.1)
 #if __aarch64__
+VF (__v_atanf, -10.0, 10.0)
 VD (__v_atan, -10.0, 10.0)
 {"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
 {"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}},
@@ -37,6 +40,9 @@
 VD (__v_log10, 0.01, 11.1)
 VF (__v_log10f, 0.01, 11.1)
 #ifdef __vpcs
+VNF (__vn_atanf, -10.0, 10.0)
+VNF (_ZGVnN4v_atanf, -10.0, 10.0)
+
 VND (__vn_atan, -10.0, 10.0)
 VND (_ZGVnN2v_atan, -10.0, 10.0)
 
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 66ac3bf..674d718 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -169,6 +169,14 @@
    1e6       1e32  40000
 '
 
+range_atanf='
+ -10.0       10.0  50000
+  -1.0        1.0  40000
+   0.0        1.0  40000
+   1.0      100.0  40000
+   1e6       1e32  40000
+'
+
 # error limits
 L_erfc=3.7
 L_erfcf=1.0
@@ -179,6 +187,7 @@
 L_atan2=2.9
 L_atan=3.0
 L_atan2f=3.0
+L_atanf=3.0
 
 while read G F R
 do
@@ -218,6 +227,10 @@
 log10  __vn_log10      $runvn
 log10  _ZGVnN2v_log10  $runvn
 
+atanf  __s_atanf       $runs
+atanf  __v_atanf       $runv
+atanf  __vn_atanf      $runvn
+atanf  _ZGVnN4v_atanf  $runvn
 atan2f __s_atan2f       $runs
 atan2f __v_atan2f       $runv
 atan2f __vn_atan2f      $runvn
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 5bc7411..40fae51 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -12,6 +12,7 @@
 D1 (erfc)
 D1 (log10)
 #if WANT_VMATH
+F (__s_atanf, __s_atanf, atan, mpfr_atan, 1, 1, f1, 0)
 F (__s_atan, __s_atan, atanl, mpfr_atan, 1, 0, d1, 0)
 F (__s_atan2f, __s_atan2f, atan2, mpfr_atan2, 2, 1, f2, 0)
 F (__s_atan2, __s_atan2, atan2l, mpfr_atan2, 2, 0, d2, 0)
@@ -22,6 +23,7 @@
 F (__s_log10f, __s_log10f, log10, mpfr_log10, 1, 1, f1, 0)
 F (__s_log10, __s_log10, log10l, mpfr_log10, 1, 0, d1, 0)
 #if __aarch64__
+F (__v_atanf, v_atanf, atan, mpfr_atan, 1, 1, f1, 1)
 F (__v_atan, v_atan, atanl, mpfr_atan, 1, 0, d1, 1)
 F (__v_atan2f, v_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
 F (__v_atan2, v_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
@@ -32,6 +34,7 @@
 F (__v_log10f, v_log10f, log10, mpfr_log10, 1, 1, f1, 1)
 F (__v_log10, v_log10, log10l, mpfr_log10, 1, 0, d1, 1)
 #ifdef __vpcs
+F (__vn_atanf, vn_atanf, atan, mpfr_atan, 1, 1, f1, 1)
 F (__vn_atan, vn_atan, atanl, mpfr_atan, 1, 0, d1, 1)
 F (__vn_atan2f, vn_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
 F (__vn_atan2, vn_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
@@ -41,6 +44,7 @@
 F (__vn_erfc, vn_erfc, erfcl, mpfr_erfc, 1, 0, d1, 1)
 F (__vn_log10f, vn_log10f, log10, mpfr_log10, 1, 1, f1, 1)
 F (__vn_log10, vn_log10, log10l, mpfr_log10, 1, 0, d1, 1)
+F (_ZGVnN4v_atanf, Z_atanf, atan, mpfr_atan, 1, 1, f1, 1)
 F (_ZGVnN2v_atan, Z_atan, atanl, mpfr_atan, 1, 0, d1, 1)
 F (_ZGVnN4vv_atan2f, Z_atan2f, atan2, mpfr_atan2, 2, 1, f2, 1)
 F (_ZGVnN2vv_atan2, Z_atan2, atan2l, mpfr_atan2, 2, 0, d2, 1)
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index 0603e45..fa4ba4c 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -14,6 +14,7 @@
 
 /* Wrappers for vector functions.  */
 #if __aarch64__ && WANT_VMATH
+static float v_atanf(float x) { return __v_atanf(argf(x))[0]; }
 static float v_atan2f(float x, float y) { return __v_atan2f(argf(x), argf(y))[0]; }
 static float v_erff(float x) { return __v_erff(argf(x))[0]; }
 static float v_erfcf(float x) { return __v_erfcf(argf(x))[0]; }
@@ -24,6 +25,7 @@
 static double v_erfc(double x) { return __v_erfc(argd(x))[0]; }
 static double v_log10(double x) { return __v_log10(argd(x))[0]; }
 #ifdef __vpcs
+static float vn_atanf(float x) { return __vn_atanf(argf(x))[0]; }
 static float vn_atan2f(float x, float y) { return __vn_atan2f(argf(x), argf(y))[0]; }
 static float vn_erff(float x) { return __vn_erff(argf(x))[0]; }
 static float vn_erfcf(float x) { return __vn_erfcf(argf(x))[0]; }
@@ -34,6 +36,7 @@
 static double vn_erfc(double x) { return __vn_erfc(argd(x))[0]; }
 static double vn_log10(double x) { return __vn_log10(argd(x))[0]; }
 
+static float Z_atanf(float x) { return _ZGVnN4v_atanf(argf(x))[0]; }
 static float Z_atan2f(float x, float y) { return _ZGVnN4vv_atan2f(argf(x), argf(y))[0]; }
 static float Z_erff(float x) { return _ZGVnN4v_erff(argf(x))[0]; }
 static float Z_erfcf(float x) { return _ZGVnN4v_erfcf(argf(x))[0]; }
diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c
new file mode 100644
index 0000000..7c84244
--- /dev/null
+++ b/pl/math/v_atanf_3u.c
@@ -0,0 +1,49 @@
+/*
+ * Single-precision vector atan(x) function.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#if V_SUPPORTED
+
+#include "atanf_common.h"
+
+#define PiOver2 v_f32 (0x1.921fb6p+0f)
+#define AbsMask v_u32 (0x7fffffff)
+
+/* Fast implementation of vector atanf based on
+   atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
+   using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
+   v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1.  */
+VPCS_ATTR
+v_f32_t V_NAME (atanf) (v_f32_t x)
+{
+  /* No need to trigger special case. Small cases, infs and nans
+     are supported by our approximation technique.  */
+  v_u32_t ix = v_as_u32_f32 (x);
+  v_u32_t sign = ix & ~AbsMask;
+
+  /* Argument reduction:
+     y := arctan(x) for x < 1
+     y := pi/2 + arctan(-1/x) for x > 1
+     Hence, use z=-1/a if x>=1, otherwise z=a.  */
+  v_u32_t red = v_cagt_f32 (x, v_f32 (1.0));
+  /* Avoid dependency in abs(x) in division (and comparison).  */
+  v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x);
+  v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f));
+  /* Use absolute value only when needed (odd powers of z).  */
+  v_f32_t az = v_abs_f32 (z);
+  az = v_sel_f32 (red, -az, az);
+
+  /* Calculate the polynomial approximation.  */
+  v_f32_t y = eval_poly (z, az, shift);
+
+  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
+  y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign);
+
+  return y;
+}
+VPCS_ALIAS
+#endif
diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c
new file mode 100644
index 0000000..17ba6b8
--- /dev/null
+++ b/pl/math/vn_atanf_3u.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_atanf.
+ *
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "include/mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_atanf, _ZGVnN4v_atanf)
+#include "v_atanf_3u.c"
+#endif