| 1 | /* Single-precision vector (SVE) tan function |
| 2 | |
| 3 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
| 4 | This file is part of the GNU C Library. |
| 5 | |
| 6 | The GNU C Library is free software; you can redistribute it and/or |
| 7 | modify it under the terms of the GNU Lesser General Public |
| 8 | License as published by the Free Software Foundation; either |
| 9 | version 2.1 of the License, or (at your option) any later version. |
| 10 | |
| 11 | The GNU C Library is distributed in the hope that it will be useful, |
| 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | Lesser General Public License for more details. |
| 15 | |
| 16 | You should have received a copy of the GNU Lesser General Public |
| 17 | License along with the GNU C Library; if not, see |
| 18 | <https://www.gnu.org/licenses/>. */ |
| 19 | |
| 20 | #include "sv_math.h" |
| 21 | |
| 22 | static const struct data |
| 23 | { |
| 24 | float pio2_1, pio2_2, pio2_3, invpio2; |
| 25 | float c1, c3, c5; |
| 26 | float c0, c2, c4, range_val, shift; |
| 27 | } data = { |
| 28 | /* Coefficients generated using: |
| 29 | poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), |
| 30 | deg, |
| 31 | [|single ...|], |
| 32 | [a*a;b*b]); |
| 33 | optimize relative error |
| 34 | final prec : 23 bits |
| 35 | deg : 5 |
| 36 | a : 0x1p-126 ^ 2 |
| 37 | b : ((pi) / 0x1p2) ^ 2 |
| 38 | dirty rel error: 0x1.f7c2e4p-25 |
| 39 | dirty abs error: 0x1.f7c2ecp-25. */ |
| 40 | .c0 = 0x1.55555p-2, .c1 = 0x1.11166p-3, |
| 41 | .c2 = 0x1.b88a78p-5, .c3 = 0x1.7b5756p-6, |
| 42 | .c4 = 0x1.4ef4cep-8, .c5 = 0x1.0e1e74p-7, |
| 43 | |
| 44 | .pio2_1 = 0x1.921fb6p+0f, .pio2_2 = -0x1.777a5cp-25f, |
| 45 | .pio2_3 = -0x1.ee59dap-50f, .invpio2 = 0x1.45f306p-1f, |
| 46 | .range_val = 0x1p15f, .shift = 0x1.8p+23f |
| 47 | }; |
| 48 | |
| 49 | static svfloat32_t NOINLINE |
| 50 | special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) |
| 51 | { |
| 52 | return sv_call_f32 (f: tanf, x, y, cmp); |
| 53 | } |
| 54 | |
| 55 | /* Fast implementation of SVE tanf. |
| 56 | Maximum error is 3.45 ULP: |
| 57 | SV_NAME_F1 (tan)(-0x1.e5f0cap+13) got 0x1.ff9856p-1 |
| 58 | want 0x1.ff9850p-1. */ |
| 59 | svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) |
| 60 | { |
| 61 | const struct data *d = ptr_barrier (&data); |
| 62 | |
| 63 | svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); |
| 64 | svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); |
| 65 | |
| 66 | /* n = rint(x/(pi/2)). */ |
| 67 | svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3)); |
| 68 | /* n is already a signed integer, simply convert it. */ |
| 69 | svint32_t in = svcvt_s32_x (pg, n); |
| 70 | /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ |
| 71 | svint32_t alt = svand_x (pg, in, 1); |
| 72 | svbool_t pred_alt = svcmpne (pg, alt, 0); |
| 73 | /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ |
| 74 | svfloat32_t r; |
| 75 | r = svmls_lane (x, n, pi_vals, 0); |
| 76 | r = svmls_lane (r, n, pi_vals, 1); |
| 77 | r = svmls_lane (r, n, pi_vals, 2); |
| 78 | |
| 79 | /* If x lives in an interval, where |tan(x)| |
| 80 | - is finite, then use a polynomial approximation of the form |
| 81 | tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). |
| 82 | - grows to infinity then use symmetries of tangent and the identity |
| 83 | tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use |
| 84 | the same polynomial approximation of tan as above. */ |
| 85 | |
| 86 | /* Perform additional reduction if required. */ |
| 87 | svfloat32_t z = svneg_m (r, pred_alt, r); |
| 88 | |
| 89 | /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], |
| 90 | using Estrin on z^2. */ |
| 91 | svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r); |
| 92 | svfloat32_t p01 = svmla_lane (sv_f32 (x: d->c0), z2, odd_coeffs, 0); |
| 93 | svfloat32_t p23 = svmla_lane (sv_f32 (x: d->c2), z2, odd_coeffs, 1); |
| 94 | svfloat32_t p45 = svmla_lane (sv_f32 (x: d->c4), z2, odd_coeffs, 2); |
| 95 | |
| 96 | svfloat32_t z4 = svmul_x (pg, z2, z2); |
| 97 | svfloat32_t p = svmla_x (pg, p01, z4, p23); |
| 98 | |
| 99 | svfloat32_t z8 = svmul_x (pg, z4, z4); |
| 100 | p = svmla_x (pg, p, z8, p45); |
| 101 | |
| 102 | svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); |
| 103 | |
| 104 | /* No need to pass pg to specialcase here since cmp is a strict subset, |
| 105 | guaranteed by the cmpge above. */ |
| 106 | |
| 107 | /* Determine whether input is too large to perform fast regression. */ |
| 108 | svbool_t cmp = svacge (pg, x, d->range_val); |
| 109 | if (__glibc_unlikely (svptest_any (pg, cmp))) |
| 110 | return special_case (x, y: svdivr_x (pg, y, 1.0f), cmp); |
| 111 | |
| 112 | svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); |
| 113 | return svsel (pred_alt, inv_y, y); |
| 114 | } |
| 115 | |