1 | /* Single-precision SVE inverse tan |
2 | |
3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "sv_math.h" |
21 | #include "poly_sve_f32.h" |
22 | |
23 | static const struct data |
24 | { |
25 | float32_t poly[8]; |
26 | float32_t pi_over_2; |
27 | } data = { |
28 | /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on |
29 | [2**-128, 1.0]. */ |
30 | .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, |
31 | -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, |
32 | .pi_over_2 = 0x1.921fb6p+0f, |
33 | }; |
34 | |
35 | #define SignMask (0x80000000) |
36 | |
37 | /* Fast implementation of SVE atanf based on |
38 | atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using |
39 | z=-1/x and shift = pi/2. |
40 | Largest observed error is 2.9 ULP, close to +/-1.0: |
41 | _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 |
42 | want -0x1.967fp-1. */ |
43 | svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) |
44 | { |
45 | const struct data *d = ptr_barrier (&data); |
46 | |
47 | /* No need to trigger special case. Small cases, infs and nans |
48 | are supported by our approximation technique. */ |
49 | svuint32_t ix = svreinterpret_u32 (x); |
50 | svuint32_t sign = svand_x (pg, ix, SignMask); |
51 | |
52 | /* Argument reduction: |
53 | y := arctan(x) for x < 1 |
54 | y := pi/2 + arctan(-1/x) for x > 1 |
55 | Hence, use z=-1/a if x>=1, otherwise z=a. */ |
56 | svbool_t red = svacgt (pg, x, 1.0f); |
57 | /* Avoid dependency in abs(x) in division (and comparison). */ |
58 | svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (x: 1.0f), x), x); |
59 | /* Use absolute value only when needed (odd powers of z). */ |
60 | svfloat32_t az = svabs_x (pg, z); |
61 | az = svneg_m (az, red, az); |
62 | |
63 | /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ |
64 | svfloat32_t z2 = svmul_x (pg, z, z); |
65 | svfloat32_t z4 = svmul_x (pg, z2, z2); |
66 | svfloat32_t z8 = svmul_x (pg, z4, z4); |
67 | |
68 | svfloat32_t y = sv_estrin_7_f32_x (pg, x: z2, x2: z4, x4: z8, poly: d->poly); |
69 | |
70 | /* y = shift + z + z^3 * P(z^2). */ |
71 | svfloat32_t z3 = svmul_x (pg, z2, az); |
72 | y = svmla_x (pg, az, z3, y); |
73 | |
74 | /* Apply shift as indicated by 'red' predicate. */ |
75 | y = svadd_m (red, y, sv_f32 (x: d->pi_over_2)); |
76 | |
77 | /* y = atan(x) if x>0, -atan(-x) otherwise. */ |
78 | return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); |
79 | } |
80 | |