1 | /* Helpers for evaluating polynomials with various schemes - specific to SVE |
2 | but precision-agnostic. |
3 | |
4 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
5 | This file is part of the GNU C Library. |
6 | |
7 | The GNU C Library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Lesser General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2.1 of the License, or (at your option) any later version. |
11 | |
12 | The GNU C Library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Lesser General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Lesser General Public |
18 | License along with the GNU C Library; if not, see |
19 | <https://www.gnu.org/licenses/>. */ |
20 | |
/* This header is a template: the including file must define the four macros
   checked below before including it.  As used by the helpers in this file,
   VTYPE is the vector type of the arguments and results, STYPE is the scalar
   type of the coefficient array, VWRAP (name) produces the identifier of each
   generated helper, and DUP (c) is applied to a scalar coefficient to obtain
   an operand of type VTYPE.  */
#ifndef VTYPE
# error Cannot use poly_generic without defining VTYPE
#endif
#ifndef STYPE
# error Cannot use poly_generic without defining STYPE
#endif
#ifndef VWRAP
# error Cannot use poly_generic without defining VWRAP
#endif
#ifndef DUP
# error Cannot use poly_generic without defining DUP
#endif
33 | |
34 | static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2, |
35 | const STYPE *poly) |
36 | { |
37 | /* At order 3, Estrin and Pairwise Horner are identical. */ |
38 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
39 | VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); |
40 | return svmla_x (pg, p01, p23, x2); |
41 | } |
42 | |
43 | static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, |
44 | const STYPE *poly) |
45 | { |
46 | VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); |
47 | return svmla_x (pg, p03, x4, poly[4]); |
48 | } |
49 | static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, |
50 | const STYPE *poly) |
51 | { |
52 | VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); |
53 | VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); |
54 | return svmla_x (pg, p03, p45, x4); |
55 | } |
56 | static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, |
57 | const STYPE *poly) |
58 | { |
59 | VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); |
60 | VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); |
61 | VTYPE p46 = svmla_x (pg, p45, x, poly[6]); |
62 | return svmla_x (pg, p03, p46, x4); |
63 | } |
64 | static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, |
65 | const STYPE *poly) |
66 | { |
67 | VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); |
68 | VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly: poly + 4); |
69 | return svmla_x (pg, p03, p47, x4); |
70 | } |
71 | static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, |
72 | VTYPE x8, const STYPE *poly) |
73 | { |
74 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]); |
75 | } |
76 | static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, |
77 | VTYPE x8, const STYPE *poly) |
78 | { |
79 | VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); |
80 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8); |
81 | } |
82 | static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2, |
83 | VTYPE x4, VTYPE x8, const STYPE *poly) |
84 | { |
85 | VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); |
86 | VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]); |
87 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8); |
88 | } |
89 | static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2, |
90 | VTYPE x4, VTYPE x8, const STYPE *poly) |
91 | { |
92 | VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly: poly + 8); |
93 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8); |
94 | } |
95 | static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2, |
96 | VTYPE x4, VTYPE x8, const STYPE *poly) |
97 | { |
98 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), |
99 | VWRAP (estrin_4) (pg, x, x2, x4, poly: poly + 8), x8); |
100 | } |
101 | static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2, |
102 | VTYPE x4, VTYPE x8, const STYPE *poly) |
103 | { |
104 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), |
105 | VWRAP (estrin_5) (pg, x, x2, x4, poly: poly + 8), x8); |
106 | } |
107 | static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2, |
108 | VTYPE x4, VTYPE x8, const STYPE *poly) |
109 | { |
110 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), |
111 | VWRAP (estrin_6) (pg, x, x2, x4, poly: poly + 8), x8); |
112 | } |
113 | static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2, |
114 | VTYPE x4, VTYPE x8, const STYPE *poly) |
115 | { |
116 | return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), |
117 | VWRAP (estrin_7) (pg, x, x2, x4, poly: poly + 8), x8); |
118 | } |
119 | static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2, |
120 | VTYPE x4, VTYPE x8, VTYPE x16, |
121 | const STYPE *poly) |
122 | { |
123 | return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16, |
124 | poly[16]); |
125 | } |
126 | static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2, |
127 | VTYPE x4, VTYPE x8, VTYPE x16, |
128 | const STYPE *poly) |
129 | { |
130 | VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); |
131 | return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_17, |
132 | x16); |
133 | } |
134 | static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2, |
135 | VTYPE x4, VTYPE x8, VTYPE x16, |
136 | const STYPE *poly) |
137 | { |
138 | VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); |
139 | VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]); |
140 | return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18, |
141 | x16); |
142 | } |
143 | static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2, |
144 | VTYPE x4, VTYPE x8, VTYPE x16, |
145 | const STYPE *poly) |
146 | { |
147 | return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), |
148 | VWRAP (pairwise_poly_3) (pg, x, x2, poly: poly + 16), x16); |
149 | } |
150 | |
151 | static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly) |
152 | { |
153 | VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]); |
154 | p = svmad_x (pg, x, p, poly[1]); |
155 | p = svmad_x (pg, x, p, poly[0]); |
156 | return p; |
157 | } |
158 | static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly) |
159 | { |
160 | VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]); |
161 | p = svmad_x (pg, x, p, poly[2]); |
162 | p = svmad_x (pg, x, p, poly[1]); |
163 | p = svmad_x (pg, x, p, poly[0]); |
164 | return p; |
165 | } |
166 | static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly) |
167 | { |
168 | return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly: poly + 1), poly[0]); |
169 | } |
170 | static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly) |
171 | { |
172 | return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly: poly + 1), poly[0]); |
173 | } |
174 | static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly) |
175 | { |
176 | return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly: poly + 1), poly[0]); |
177 | } |
178 | static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly) |
179 | { |
180 | return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly: poly + 1), poly[0]); |
181 | } |
182 | static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly) |
183 | { |
184 | return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly: poly + 1), poly[0]); |
185 | } |
186 | static inline VTYPE |
187 | sv_horner_10_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) |
188 | { |
189 | return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly: poly + 1), poly[0]); |
190 | } |
191 | static inline VTYPE |
192 | sv_horner_11_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) |
193 | { |
194 | return svmad_x (pg, x, sv_horner_10_f32_x (pg, x, poly: poly + 1), poly[0]); |
195 | } |
196 | static inline VTYPE |
197 | sv_horner_12_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) |
198 | { |
199 | return svmad_x (pg, x, sv_horner_11_f32_x (pg, x, poly: poly + 1), poly[0]); |
200 | } |
201 | |
202 | static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2, |
203 | const STYPE *poly) |
204 | { |
205 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
206 | VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); |
207 | VTYPE p; |
208 | p = svmla_x (pg, p23, x2, poly[4]); |
209 | p = svmla_x (pg, p01, x2, p); |
210 | return p; |
211 | } |
212 | static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, |
213 | const STYPE *poly) |
214 | { |
215 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
216 | VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); |
217 | VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); |
218 | VTYPE p; |
219 | p = svmla_x (pg, p23, x2, p45); |
220 | p = svmla_x (pg, p01, x2, p); |
221 | return p; |
222 | } |
223 | static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2, |
224 | const STYPE *poly) |
225 | { |
226 | VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly: poly + 2); |
227 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
228 | return svmla_x (pg, p01, x2, p26); |
229 | } |
230 | static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2, |
231 | const STYPE *poly) |
232 | { |
233 | VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly: poly + 2); |
234 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
235 | return svmla_x (pg, p01, x2, p27); |
236 | } |
237 | static inline VTYPE VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2, |
238 | const STYPE *poly) |
239 | { |
240 | VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly: poly + 2); |
241 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
242 | return svmla_x (pg, p01, x2, p28); |
243 | } |
244 | static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, |
245 | const STYPE *poly) |
246 | { |
247 | VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly: poly + 2); |
248 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
249 | return svmla_x (pg, p01, x2, p29); |
250 | } |
251 | static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2, |
252 | const STYPE *poly) |
253 | { |
254 | VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly: poly + 2); |
255 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
256 | return svmla_x (pg, p01, x2, p2_10); |
257 | } |
258 | static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2, |
259 | const STYPE *poly) |
260 | { |
261 | VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly: poly + 2); |
262 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
263 | return svmla_x (pg, p01, x2, p2_11); |
264 | } |
265 | static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2, |
266 | const STYPE *poly) |
267 | { |
268 | VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly: poly + 2); |
269 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
270 | return svmla_x (pg, p01, x2, p2_12); |
271 | } |
272 | static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2, |
273 | const STYPE *poly) |
274 | { |
275 | VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly: poly + 2); |
276 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
277 | return svmla_x (pg, p01, x2, p2_13); |
278 | } |
279 | static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2, |
280 | const STYPE *poly) |
281 | { |
282 | VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly: poly + 2); |
283 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
284 | return svmla_x (pg, p01, x2, p2_14); |
285 | } |
286 | static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2, |
287 | const STYPE *poly) |
288 | { |
289 | VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly: poly + 2); |
290 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
291 | return svmla_x (pg, p01, x2, p2_15); |
292 | } |
293 | static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2, |
294 | const STYPE *poly) |
295 | { |
296 | VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly: poly + 2); |
297 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
298 | return svmla_x (pg, p01, x2, p2_16); |
299 | } |
300 | static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2, |
301 | const STYPE *poly) |
302 | { |
303 | VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly: poly + 2); |
304 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
305 | return svmla_x (pg, p01, x2, p2_17); |
306 | } |
307 | static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2, |
308 | const STYPE *poly) |
309 | { |
310 | VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly: poly + 2); |
311 | VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); |
312 | return svmla_x (pg, p01, x2, p2_18); |
313 | } |
314 | |