1/* Helpers for evaluating polynomials with various schemes - specific to SVE
2 but precision-agnostic.
3
4 Copyright (C) 2023-2024 Free Software Foundation, Inc.
5 This file is part of the GNU C Library.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, see
19 <https://www.gnu.org/licenses/>. */
20
21#ifndef VTYPE
22# error Cannot use poly_generic without defining VTYPE
23#endif
24#ifndef STYPE
25# error Cannot use poly_generic without defining STYPE
26#endif
27#ifndef VWRAP
28# error Cannot use poly_generic without defining VWRAP
29#endif
30#ifndef DUP
31# error Cannot use poly_generic without defining DUP
32#endif
33
34static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2,
35 const STYPE *poly)
36{
37 /* At order 3, Estrin and Pairwise Horner are identical. */
38 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
39 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
40 return svmla_x (pg, p01, p23, x2);
41}
42
43static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
44 const STYPE *poly)
45{
46 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
47 return svmla_x (pg, p03, x4, poly[4]);
48}
49static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
50 const STYPE *poly)
51{
52 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
53 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
54 return svmla_x (pg, p03, p45, x4);
55}
56static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
57 const STYPE *poly)
58{
59 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
60 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
61 VTYPE p46 = svmla_x (pg, p45, x, poly[6]);
62 return svmla_x (pg, p03, p46, x4);
63}
64static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
65 const STYPE *poly)
66{
67 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
68 VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly: poly + 4);
69 return svmla_x (pg, p03, p47, x4);
70}
71static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
72 VTYPE x8, const STYPE *poly)
73{
74 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]);
75}
76static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
77 VTYPE x8, const STYPE *poly)
78{
79 VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
80 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8);
81}
82static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2,
83 VTYPE x4, VTYPE x8, const STYPE *poly)
84{
85 VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
86 VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]);
87 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8);
88}
89static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2,
90 VTYPE x4, VTYPE x8, const STYPE *poly)
91{
92 VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly: poly + 8);
93 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8);
94}
95static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2,
96 VTYPE x4, VTYPE x8, const STYPE *poly)
97{
98 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
99 VWRAP (estrin_4) (pg, x, x2, x4, poly: poly + 8), x8);
100}
101static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2,
102 VTYPE x4, VTYPE x8, const STYPE *poly)
103{
104 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
105 VWRAP (estrin_5) (pg, x, x2, x4, poly: poly + 8), x8);
106}
107static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2,
108 VTYPE x4, VTYPE x8, const STYPE *poly)
109{
110 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
111 VWRAP (estrin_6) (pg, x, x2, x4, poly: poly + 8), x8);
112}
113static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2,
114 VTYPE x4, VTYPE x8, const STYPE *poly)
115{
116 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
117 VWRAP (estrin_7) (pg, x, x2, x4, poly: poly + 8), x8);
118}
119static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2,
120 VTYPE x4, VTYPE x8, VTYPE x16,
121 const STYPE *poly)
122{
123 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16,
124 poly[16]);
125}
126static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2,
127 VTYPE x4, VTYPE x8, VTYPE x16,
128 const STYPE *poly)
129{
130 VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
131 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_17,
132 x16);
133}
134static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2,
135 VTYPE x4, VTYPE x8, VTYPE x16,
136 const STYPE *poly)
137{
138 VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
139 VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]);
140 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18,
141 x16);
142}
143static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2,
144 VTYPE x4, VTYPE x8, VTYPE x16,
145 const STYPE *poly)
146{
147 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly),
148 VWRAP (pairwise_poly_3) (pg, x, x2, poly: poly + 16), x16);
149}
150
151static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly)
152{
153 VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]);
154 p = svmad_x (pg, x, p, poly[1]);
155 p = svmad_x (pg, x, p, poly[0]);
156 return p;
157}
158static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly)
159{
160 VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]);
161 p = svmad_x (pg, x, p, poly[2]);
162 p = svmad_x (pg, x, p, poly[1]);
163 p = svmad_x (pg, x, p, poly[0]);
164 return p;
165}
166static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly)
167{
168 return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly: poly + 1), poly[0]);
169}
170static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly)
171{
172 return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly: poly + 1), poly[0]);
173}
174static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly)
175{
176 return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly: poly + 1), poly[0]);
177}
178static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly)
179{
180 return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly: poly + 1), poly[0]);
181}
182static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly)
183{
184 return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly: poly + 1), poly[0]);
185}
186static inline VTYPE
187sv_horner_10_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
188{
189 return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly: poly + 1), poly[0]);
190}
191static inline VTYPE
192sv_horner_11_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
193{
194 return svmad_x (pg, x, sv_horner_10_f32_x (pg, x, poly: poly + 1), poly[0]);
195}
196static inline VTYPE
197sv_horner_12_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
198{
199 return svmad_x (pg, x, sv_horner_11_f32_x (pg, x, poly: poly + 1), poly[0]);
200}
201
202static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2,
203 const STYPE *poly)
204{
205 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
206 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
207 VTYPE p;
208 p = svmla_x (pg, p23, x2, poly[4]);
209 p = svmla_x (pg, p01, x2, p);
210 return p;
211}
212static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
213 const STYPE *poly)
214{
215 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
216 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
217 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
218 VTYPE p;
219 p = svmla_x (pg, p23, x2, p45);
220 p = svmla_x (pg, p01, x2, p);
221 return p;
222}
223static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2,
224 const STYPE *poly)
225{
226 VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly: poly + 2);
227 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
228 return svmla_x (pg, p01, x2, p26);
229}
230static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2,
231 const STYPE *poly)
232{
233 VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly: poly + 2);
234 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
235 return svmla_x (pg, p01, x2, p27);
236}
237static inline VTYPE VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2,
238 const STYPE *poly)
239{
240 VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly: poly + 2);
241 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
242 return svmla_x (pg, p01, x2, p28);
243}
244static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
245 const STYPE *poly)
246{
247 VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly: poly + 2);
248 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
249 return svmla_x (pg, p01, x2, p29);
250}
251static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2,
252 const STYPE *poly)
253{
254 VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly: poly + 2);
255 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
256 return svmla_x (pg, p01, x2, p2_10);
257}
258static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2,
259 const STYPE *poly)
260{
261 VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly: poly + 2);
262 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
263 return svmla_x (pg, p01, x2, p2_11);
264}
265static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2,
266 const STYPE *poly)
267{
268 VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly: poly + 2);
269 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
270 return svmla_x (pg, p01, x2, p2_12);
271}
272static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2,
273 const STYPE *poly)
274{
275 VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly: poly + 2);
276 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
277 return svmla_x (pg, p01, x2, p2_13);
278}
279static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2,
280 const STYPE *poly)
281{
282 VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly: poly + 2);
283 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
284 return svmla_x (pg, p01, x2, p2_14);
285}
286static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2,
287 const STYPE *poly)
288{
289 VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly: poly + 2);
290 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
291 return svmla_x (pg, p01, x2, p2_15);
292}
293static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2,
294 const STYPE *poly)
295{
296 VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly: poly + 2);
297 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
298 return svmla_x (pg, p01, x2, p2_16);
299}
300static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2,
301 const STYPE *poly)
302{
303 VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly: poly + 2);
304 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
305 return svmla_x (pg, p01, x2, p2_17);
306}
307static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2,
308 const STYPE *poly)
309{
310 VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly: poly + 2);
311 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
312 return svmla_x (pg, p01, x2, p2_18);
313}
314

/* Source code of glibc/sysdeps/aarch64/fpu/poly_sve_generic.h  */