1 | /* Single-precision vector (SVE) tan function |
2 | |
3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "sv_math.h" |
21 | |
22 | static const struct data |
23 | { |
24 | float pio2_1, pio2_2, pio2_3, invpio2; |
25 | float c1, c3, c5; |
26 | float c0, c2, c4, range_val, shift; |
27 | } data = { |
28 | /* Coefficients generated using: |
29 | poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), |
30 | deg, |
31 | [|single ...|], |
32 | [a*a;b*b]); |
33 | optimize relative error |
34 | final prec : 23 bits |
35 | deg : 5 |
36 | a : 0x1p-126 ^ 2 |
37 | b : ((pi) / 0x1p2) ^ 2 |
38 | dirty rel error: 0x1.f7c2e4p-25 |
39 | dirty abs error: 0x1.f7c2ecp-25. */ |
40 | .c0 = 0x1.55555p-2, .c1 = 0x1.11166p-3, |
41 | .c2 = 0x1.b88a78p-5, .c3 = 0x1.7b5756p-6, |
42 | .c4 = 0x1.4ef4cep-8, .c5 = 0x1.0e1e74p-7, |
43 | |
44 | .pio2_1 = 0x1.921fb6p+0f, .pio2_2 = -0x1.777a5cp-25f, |
45 | .pio2_3 = -0x1.ee59dap-50f, .invpio2 = 0x1.45f306p-1f, |
46 | .range_val = 0x1p15f, .shift = 0x1.8p+23f |
47 | }; |
48 | |
49 | static svfloat32_t NOINLINE |
50 | special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) |
51 | { |
52 | return sv_call_f32 (f: tanf, x, y, cmp); |
53 | } |
54 | |
55 | /* Fast implementation of SVE tanf. |
56 | Maximum error is 3.45 ULP: |
57 | SV_NAME_F1 (tan)(-0x1.e5f0cap+13) got 0x1.ff9856p-1 |
58 | want 0x1.ff9850p-1. */ |
59 | svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) |
60 | { |
61 | const struct data *d = ptr_barrier (&data); |
62 | |
63 | /* Determine whether input is too large to perform fast regression. */ |
64 | svbool_t cmp = svacge (pg, x, d->range_val); |
65 | |
66 | svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); |
67 | svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); |
68 | |
69 | /* n = rint(x/(pi/2)). */ |
70 | svfloat32_t q = svmla_lane (sv_f32 (x: d->shift), x, pi_vals, 3); |
71 | svfloat32_t n = svsub_x (pg, q, d->shift); |
72 | /* n is already a signed integer, simply convert it. */ |
73 | svint32_t in = svcvt_s32_x (pg, n); |
74 | /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ |
75 | svint32_t alt = svand_x (pg, in, 1); |
76 | svbool_t pred_alt = svcmpne (pg, alt, 0); |
77 | |
78 | /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ |
79 | svfloat32_t r; |
80 | r = svmls_lane (x, n, pi_vals, 0); |
81 | r = svmls_lane (r, n, pi_vals, 1); |
82 | r = svmls_lane (r, n, pi_vals, 2); |
83 | |
84 | /* If x lives in an interval, where |tan(x)| |
85 | - is finite, then use a polynomial approximation of the form |
86 | tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). |
87 | - grows to infinity then use symmetries of tangent and the identity |
88 | tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use |
89 | the same polynomial approximation of tan as above. */ |
90 | |
91 | /* Perform additional reduction if required. */ |
92 | svfloat32_t z = svneg_m (r, pred_alt, r); |
93 | |
94 | /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], |
95 | using Estrin on z^2. */ |
96 | svfloat32_t z2 = svmul_x (pg, z, z); |
97 | svfloat32_t p01 = svmla_lane (sv_f32 (x: d->c0), z2, odd_coeffs, 0); |
98 | svfloat32_t p23 = svmla_lane (sv_f32 (x: d->c2), z2, odd_coeffs, 1); |
99 | svfloat32_t p45 = svmla_lane (sv_f32 (x: d->c4), z2, odd_coeffs, 2); |
100 | |
101 | svfloat32_t z4 = svmul_x (pg, z2, z2); |
102 | svfloat32_t p = svmla_x (pg, p01, z4, p23); |
103 | |
104 | svfloat32_t z8 = svmul_x (pg, z4, z4); |
105 | p = svmla_x (pg, p, z8, p45); |
106 | |
107 | svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); |
108 | |
109 | /* Transform result back, if necessary. */ |
110 | svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); |
111 | |
112 | /* No need to pass pg to specialcase here since cmp is a strict subset, |
113 | guaranteed by the cmpge above. */ |
114 | if (__glibc_unlikely (svptest_any (pg, cmp))) |
115 | return special_case (x, y: svsel (pred_alt, inv_y, y), cmp); |
116 | |
117 | return svsel (pred_alt, inv_y, y); |
118 | } |
119 | |