1 | /* |
2 | * Vector math abstractions. |
3 | * |
4 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
5 | * See https://llvm.org/LICENSE.txt for license information. |
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
7 | */ |
8 | |
9 | #ifndef _V_MATH_H |
10 | #define _V_MATH_H |
11 | |
12 | #ifndef WANT_VMATH |
13 | /* Enable the build of vector math code. */ |
14 | # define WANT_VMATH 1 |
15 | #endif |
16 | #if WANT_VMATH |
17 | |
/* The goal of this header is to allow vector and scalar builds of the
   same algorithm.  The provided intrinsic wrappers are also vector
   length agnostic, so they can be implemented for SVE (or other SIMD
   architectures) too, and the code should then work on those targets
   as well.  */
23 | |
/* V_NAME(x) builds the symbol name for routine x; the prefix depends on
   which build variant is selected by SCALAR / VPCS.  */
#if SCALAR
#define V_NAME(x) __s_##x
#elif VPCS && __aarch64__
#define V_NAME(x) __vn_##x
/* Function attribute selecting the aarch64_vector_pcs calling convention.  */
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
#else
#define V_NAME(x) __v_##x
#endif

/* Fall back to empty expansions when these were not defined earlier.  */
#ifndef VPCS_ATTR
#define VPCS_ATTR
#endif
#ifndef VPCS_ALIAS
#define VPCS_ALIAS
#endif
39 | |
40 | #include <stdint.h> |
41 | #include "math_config.h" |
42 | |
/* Short aliases for the scalar element types used by the wrappers.  */
typedef float f32_t;
typedef uint32_t u32_t;
typedef int32_t s32_t;
typedef double f64_t;
typedef uint64_t u64_t;
typedef int64_t s64_t;

/* Reinterpret as type1 from type2: every as_<type1>_<type2> helper reads
   the object representation of its argument back as <type1> through a
   union.  No value conversion takes place.  The union member is set with
   the standard `.member = value' designated initializer.  */

/* Bits of the float X as an unsigned 32-bit integer.  */
static inline u32_t
as_u32_f32 (f32_t x)
{
  union { f32_t f; u32_t u; } r = {.f = x};
  return r.u;
}

/* Float whose bits are the unsigned 32-bit integer X.  */
static inline f32_t
as_f32_u32 (u32_t x)
{
  union { u32_t u; f32_t f; } r = {.u = x};
  return r.f;
}

/* Bits of the unsigned 32-bit integer X as a signed 32-bit integer.  */
static inline s32_t
as_s32_u32 (u32_t x)
{
  union { u32_t u; s32_t i; } r = {.u = x};
  return r.i;
}

/* Bits of the signed 32-bit integer X as an unsigned 32-bit integer.  */
static inline u32_t
as_u32_s32 (s32_t x)
{
  union { s32_t i; u32_t u; } r = {.i = x};
  return r.u;
}

/* Bits of the double X as an unsigned 64-bit integer.  */
static inline u64_t
as_u64_f64 (f64_t x)
{
  union { f64_t f; u64_t u; } r = {.f = x};
  return r.u;
}

/* Double whose bits are the unsigned 64-bit integer X.  */
static inline f64_t
as_f64_u64 (u64_t x)
{
  union { u64_t u; f64_t f; } r = {.u = x};
  return r.f;
}

/* Bits of the unsigned 64-bit integer X as a signed 64-bit integer.  */
static inline s64_t
as_s64_u64 (u64_t x)
{
  union { u64_t u; s64_t i; } r = {.u = x};
  return r.i;
}

/* Bits of the signed 64-bit integer X as an unsigned 64-bit integer.  */
static inline u64_t
as_u64_s64 (s64_t x)
{
  union { s64_t i; u64_t u; } r = {.i = x};
  return r.u;
}
99 | |
100 | #if SCALAR |
/* Scalar build: each v_* "vector" type is simply its scalar element
   type.  */
#define V_SUPPORTED 1
typedef f32_t v_f32_t;
typedef u32_t v_u32_t;
typedef s32_t v_s32_t;
typedef f64_t v_f64_t;
typedef u64_t v_u64_t;
typedef s64_t v_s64_t;
108 | |
/* Number of 32-bit lanes per vector; the scalar build has one.  */
static inline int
v_lanes32 (void)
{
  enum { lanes = 1 };
  return lanes;
}
114 | |
115 | static inline v_f32_t |
116 | v_f32 (f32_t x) |
117 | { |
118 | return x; |
119 | } |
120 | static inline v_u32_t |
121 | v_u32 (u32_t x) |
122 | { |
123 | return x; |
124 | } |
125 | static inline v_s32_t |
126 | v_s32 (s32_t x) |
127 | { |
128 | return x; |
129 | } |
130 | |
131 | static inline f32_t |
132 | v_get_f32 (v_f32_t x, int i) |
133 | { |
134 | return x; |
135 | } |
136 | static inline u32_t |
137 | v_get_u32 (v_u32_t x, int i) |
138 | { |
139 | return x; |
140 | } |
141 | static inline s32_t |
142 | v_get_s32 (v_s32_t x, int i) |
143 | { |
144 | return x; |
145 | } |
146 | |
147 | static inline void |
148 | v_set_f32 (v_f32_t *x, int i, f32_t v) |
149 | { |
150 | *x = v; |
151 | } |
152 | static inline void |
153 | v_set_u32 (v_u32_t *x, int i, u32_t v) |
154 | { |
155 | *x = v; |
156 | } |
157 | static inline void |
158 | v_set_s32 (v_s32_t *x, int i, s32_t v) |
159 | { |
160 | *x = v; |
161 | } |
162 | |
163 | /* true if any elements of a v_cond result is non-zero. */ |
164 | static inline int |
165 | v_any_u32 (v_u32_t x) |
166 | { |
167 | return x != 0; |
168 | } |
169 | /* to wrap the result of relational operators. */ |
170 | static inline v_u32_t |
171 | v_cond_u32 (v_u32_t x) |
172 | { |
173 | return x ? -1 : 0; |
174 | } |
175 | static inline v_f32_t |
176 | v_abs_f32 (v_f32_t x) |
177 | { |
178 | return __builtin_fabsf (x); |
179 | } |
180 | static inline v_f32_t |
181 | v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) |
182 | { |
183 | return __builtin_fmaf (x, y, z); |
184 | } |
185 | static inline v_f32_t |
186 | v_round_f32 (v_f32_t x) |
187 | { |
188 | return __builtin_roundf (x); |
189 | } |
190 | static inline v_s32_t |
191 | v_round_s32 (v_f32_t x) |
192 | { |
193 | return __builtin_lroundf (x); /* relies on -fno-math-errno. */ |
194 | } |
195 | /* convert to type1 from type2. */ |
196 | static inline v_f32_t |
197 | v_to_f32_s32 (v_s32_t x) |
198 | { |
199 | return x; |
200 | } |
201 | static inline v_f32_t |
202 | v_to_f32_u32 (v_u32_t x) |
203 | { |
204 | return x; |
205 | } |
206 | /* reinterpret as type1 from type2. */ |
207 | static inline v_u32_t |
208 | v_as_u32_f32 (v_f32_t x) |
209 | { |
210 | union { v_f32_t f; v_u32_t u; } r = {.f: x}; |
211 | return r.u; |
212 | } |
213 | static inline v_f32_t |
214 | v_as_f32_u32 (v_u32_t x) |
215 | { |
216 | union { v_u32_t u; v_f32_t f; } r = {.u: x}; |
217 | return r.f; |
218 | } |
219 | static inline v_s32_t |
220 | v_as_s32_u32 (v_u32_t x) |
221 | { |
222 | union { v_u32_t u; v_s32_t i; } r = {.u: x}; |
223 | return r.i; |
224 | } |
225 | static inline v_u32_t |
226 | v_as_u32_s32 (v_s32_t x) |
227 | { |
228 | union { v_s32_t i; v_u32_t u; } r = {.i: x}; |
229 | return r.u; |
230 | } |
231 | static inline v_f32_t |
232 | v_lookup_f32 (const f32_t *tab, v_u32_t idx) |
233 | { |
234 | return tab[idx]; |
235 | } |
236 | static inline v_u32_t |
237 | v_lookup_u32 (const u32_t *tab, v_u32_t idx) |
238 | { |
239 | return tab[idx]; |
240 | } |
241 | static inline v_f32_t |
242 | v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) |
243 | { |
244 | return f (x); |
245 | } |
246 | static inline v_f32_t |
247 | v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, |
248 | v_u32_t p) |
249 | { |
250 | return f (x1, x2); |
251 | } |
252 | |
/* Number of 64-bit lanes per vector; the scalar build has one.  */
static inline int
v_lanes64 (void)
{
  enum { lanes = 1 };
  return lanes;
}
258 | static inline v_f64_t |
259 | v_f64 (f64_t x) |
260 | { |
261 | return x; |
262 | } |
263 | static inline v_u64_t |
264 | v_u64 (u64_t x) |
265 | { |
266 | return x; |
267 | } |
268 | static inline v_s64_t |
269 | v_s64 (s64_t x) |
270 | { |
271 | return x; |
272 | } |
273 | static inline f64_t |
274 | v_get_f64 (v_f64_t x, int i) |
275 | { |
276 | return x; |
277 | } |
278 | static inline void |
279 | v_set_f64 (v_f64_t *x, int i, f64_t v) |
280 | { |
281 | *x = v; |
282 | } |
283 | /* true if any elements of a v_cond result is non-zero. */ |
284 | static inline int |
285 | v_any_u64 (v_u64_t x) |
286 | { |
287 | return x != 0; |
288 | } |
289 | /* to wrap the result of relational operators. */ |
290 | static inline v_u64_t |
291 | v_cond_u64 (v_u64_t x) |
292 | { |
293 | return x ? -1 : 0; |
294 | } |
295 | static inline v_f64_t |
296 | v_abs_f64 (v_f64_t x) |
297 | { |
298 | return __builtin_fabs (x); |
299 | } |
300 | static inline v_f64_t |
301 | v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) |
302 | { |
303 | return __builtin_fma (x, y, z); |
304 | } |
305 | static inline v_f64_t |
306 | v_round_f64 (v_f64_t x) |
307 | { |
308 | return __builtin_round (x); |
309 | } |
310 | static inline v_s64_t |
311 | v_round_s64 (v_f64_t x) |
312 | { |
313 | return __builtin_lround (x); /* relies on -fno-math-errno. */ |
314 | } |
315 | /* convert to type1 from type2. */ |
316 | static inline v_f64_t |
317 | v_to_f64_s64 (v_s64_t x) |
318 | { |
319 | return x; |
320 | } |
321 | static inline v_f64_t |
322 | v_to_f64_u64 (v_u64_t x) |
323 | { |
324 | return x; |
325 | } |
326 | /* reinterpret as type1 from type2. */ |
327 | static inline v_u64_t |
328 | v_as_u64_f64 (v_f64_t x) |
329 | { |
330 | union { v_f64_t f; v_u64_t u; } r = {.f: x}; |
331 | return r.u; |
332 | } |
333 | static inline v_f64_t |
334 | v_as_f64_u64 (v_u64_t x) |
335 | { |
336 | union { v_u64_t u; v_f64_t f; } r = {.u: x}; |
337 | return r.f; |
338 | } |
339 | static inline v_s64_t |
340 | v_as_s64_u64 (v_u64_t x) |
341 | { |
342 | union { v_u64_t u; v_s64_t i; } r = {.u: x}; |
343 | return r.i; |
344 | } |
345 | static inline v_u64_t |
346 | v_as_u64_s64 (v_s64_t x) |
347 | { |
348 | union { v_s64_t i; v_u64_t u; } r = {.i: x}; |
349 | return r.u; |
350 | } |
351 | static inline v_f64_t |
352 | v_lookup_f64 (const f64_t *tab, v_u64_t idx) |
353 | { |
354 | return tab[idx]; |
355 | } |
356 | static inline v_u64_t |
357 | v_lookup_u64 (const u64_t *tab, v_u64_t idx) |
358 | { |
359 | return tab[idx]; |
360 | } |
361 | static inline v_f64_t |
362 | v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) |
363 | { |
364 | return f (x); |
365 | } |
366 | |
367 | #elif __aarch64__ |
/* AArch64 build: the v_* types map onto the four-lane 32-bit and two-lane
   64-bit vector types declared by <arm_neon.h>.  */
#define V_SUPPORTED 1
#include <arm_neon.h>
typedef float32x4_t v_f32_t;
typedef uint32x4_t v_u32_t;
typedef int32x4_t v_s32_t;
typedef float64x2_t v_f64_t;
typedef uint64x2_t v_u64_t;
typedef int64x2_t v_s64_t;
376 | |
/* Number of 32-bit lanes per vector in the AArch64 build.  */
static inline int
v_lanes32 (void)
{
  enum { lanes = 4 };
  return lanes;
}
382 | |
383 | static inline v_f32_t |
384 | v_f32 (f32_t x) |
385 | { |
386 | return (v_f32_t){x, x, x, x}; |
387 | } |
388 | static inline v_u32_t |
389 | v_u32 (u32_t x) |
390 | { |
391 | return (v_u32_t){x, x, x, x}; |
392 | } |
393 | static inline v_s32_t |
394 | v_s32 (s32_t x) |
395 | { |
396 | return (v_s32_t){x, x, x, x}; |
397 | } |
398 | |
399 | static inline f32_t |
400 | v_get_f32 (v_f32_t x, int i) |
401 | { |
402 | return x[i]; |
403 | } |
404 | static inline u32_t |
405 | v_get_u32 (v_u32_t x, int i) |
406 | { |
407 | return x[i]; |
408 | } |
409 | static inline s32_t |
410 | v_get_s32 (v_s32_t x, int i) |
411 | { |
412 | return x[i]; |
413 | } |
414 | |
415 | static inline void |
416 | v_set_f32 (v_f32_t *x, int i, f32_t v) |
417 | { |
418 | (*x)[i] = v; |
419 | } |
420 | static inline void |
421 | v_set_u32 (v_u32_t *x, int i, u32_t v) |
422 | { |
423 | (*x)[i] = v; |
424 | } |
425 | static inline void |
426 | v_set_s32 (v_s32_t *x, int i, s32_t v) |
427 | { |
428 | (*x)[i] = v; |
429 | } |
430 | |
431 | /* true if any elements of a v_cond result is non-zero. */ |
432 | static inline int |
433 | v_any_u32 (v_u32_t x) |
434 | { |
435 | /* assume elements in x are either 0 or -1u. */ |
436 | return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; |
437 | } |
438 | /* to wrap the result of relational operators. */ |
439 | static inline v_u32_t |
440 | v_cond_u32 (v_u32_t x) |
441 | { |
442 | return x; |
443 | } |
444 | static inline v_f32_t |
445 | v_abs_f32 (v_f32_t x) |
446 | { |
447 | return vabsq_f32 (x); |
448 | } |
449 | static inline v_f32_t |
450 | v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) |
451 | { |
452 | return vfmaq_f32 (z, x, y); |
453 | } |
454 | static inline v_f32_t |
455 | v_round_f32 (v_f32_t x) |
456 | { |
457 | return vrndaq_f32 (x); |
458 | } |
459 | static inline v_s32_t |
460 | v_round_s32 (v_f32_t x) |
461 | { |
462 | return vcvtaq_s32_f32 (x); |
463 | } |
464 | /* convert to type1 from type2. */ |
465 | static inline v_f32_t |
466 | v_to_f32_s32 (v_s32_t x) |
467 | { |
468 | return (v_f32_t){x[0], x[1], x[2], x[3]}; |
469 | } |
470 | static inline v_f32_t |
471 | v_to_f32_u32 (v_u32_t x) |
472 | { |
473 | return (v_f32_t){x[0], x[1], x[2], x[3]}; |
474 | } |
475 | /* reinterpret as type1 from type2. */ |
476 | static inline v_u32_t |
477 | v_as_u32_f32 (v_f32_t x) |
478 | { |
479 | union { v_f32_t f; v_u32_t u; } r = {x}; |
480 | return r.u; |
481 | } |
482 | static inline v_f32_t |
483 | v_as_f32_u32 (v_u32_t x) |
484 | { |
485 | union { v_u32_t u; v_f32_t f; } r = {x}; |
486 | return r.f; |
487 | } |
488 | static inline v_s32_t |
489 | v_as_s32_u32 (v_u32_t x) |
490 | { |
491 | union { v_u32_t u; v_s32_t i; } r = {x}; |
492 | return r.i; |
493 | } |
494 | static inline v_u32_t |
495 | v_as_u32_s32 (v_s32_t x) |
496 | { |
497 | union { v_s32_t i; v_u32_t u; } r = {x}; |
498 | return r.u; |
499 | } |
500 | static inline v_f32_t |
501 | v_lookup_f32 (const f32_t *tab, v_u32_t idx) |
502 | { |
503 | return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; |
504 | } |
505 | static inline v_u32_t |
506 | v_lookup_u32 (const u32_t *tab, v_u32_t idx) |
507 | { |
508 | return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; |
509 | } |
510 | static inline v_f32_t |
511 | v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) |
512 | { |
513 | return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], |
514 | p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; |
515 | } |
516 | static inline v_f32_t |
517 | v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, |
518 | v_u32_t p) |
519 | { |
520 | return ( |
521 | v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], |
522 | p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; |
523 | } |
524 | |
/* Number of 64-bit lanes per vector in the AArch64 build.  */
static inline int
v_lanes64 (void)
{
  enum { lanes = 2 };
  return lanes;
}
530 | static inline v_f64_t |
531 | v_f64 (f64_t x) |
532 | { |
533 | return (v_f64_t){x, x}; |
534 | } |
535 | static inline v_u64_t |
536 | v_u64 (u64_t x) |
537 | { |
538 | return (v_u64_t){x, x}; |
539 | } |
540 | static inline v_s64_t |
541 | v_s64 (s64_t x) |
542 | { |
543 | return (v_s64_t){x, x}; |
544 | } |
545 | static inline f64_t |
546 | v_get_f64 (v_f64_t x, int i) |
547 | { |
548 | return x[i]; |
549 | } |
550 | static inline void |
551 | v_set_f64 (v_f64_t *x, int i, f64_t v) |
552 | { |
553 | (*x)[i] = v; |
554 | } |
555 | /* true if any elements of a v_cond result is non-zero. */ |
556 | static inline int |
557 | v_any_u64 (v_u64_t x) |
558 | { |
559 | /* assume elements in x are either 0 or -1u. */ |
560 | return vpaddd_u64 (x) != 0; |
561 | } |
562 | /* to wrap the result of relational operators. */ |
563 | static inline v_u64_t |
564 | v_cond_u64 (v_u64_t x) |
565 | { |
566 | return x; |
567 | } |
568 | static inline v_f64_t |
569 | v_abs_f64 (v_f64_t x) |
570 | { |
571 | return vabsq_f64 (x); |
572 | } |
573 | static inline v_f64_t |
574 | v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) |
575 | { |
576 | return vfmaq_f64 (z, x, y); |
577 | } |
578 | static inline v_f64_t |
579 | v_round_f64 (v_f64_t x) |
580 | { |
581 | return vrndaq_f64 (x); |
582 | } |
583 | static inline v_s64_t |
584 | v_round_s64 (v_f64_t x) |
585 | { |
586 | return vcvtaq_s64_f64 (x); |
587 | } |
588 | /* convert to type1 from type2. */ |
589 | static inline v_f64_t |
590 | v_to_f64_s64 (v_s64_t x) |
591 | { |
592 | return (v_f64_t){x[0], x[1]}; |
593 | } |
594 | static inline v_f64_t |
595 | v_to_f64_u64 (v_u64_t x) |
596 | { |
597 | return (v_f64_t){x[0], x[1]}; |
598 | } |
599 | /* reinterpret as type1 from type2. */ |
600 | static inline v_u64_t |
601 | v_as_u64_f64 (v_f64_t x) |
602 | { |
603 | union { v_f64_t f; v_u64_t u; } r = {x}; |
604 | return r.u; |
605 | } |
606 | static inline v_f64_t |
607 | v_as_f64_u64 (v_u64_t x) |
608 | { |
609 | union { v_u64_t u; v_f64_t f; } r = {x}; |
610 | return r.f; |
611 | } |
612 | static inline v_s64_t |
613 | v_as_s64_u64 (v_u64_t x) |
614 | { |
615 | union { v_u64_t u; v_s64_t i; } r = {x}; |
616 | return r.i; |
617 | } |
618 | static inline v_u64_t |
619 | v_as_u64_s64 (v_s64_t x) |
620 | { |
621 | union { v_s64_t i; v_u64_t u; } r = {x}; |
622 | return r.u; |
623 | } |
624 | static inline v_f64_t |
625 | v_lookup_f64 (const f64_t *tab, v_u64_t idx) |
626 | { |
627 | return (v_f64_t){tab[idx[0]], tab[idx[1]]}; |
628 | } |
629 | static inline v_u64_t |
630 | v_lookup_u64 (const u64_t *tab, v_u64_t idx) |
631 | { |
632 | return (v_u64_t){tab[idx[0]], tab[idx[1]]}; |
633 | } |
634 | static inline v_f64_t |
635 | v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) |
636 | { |
637 | return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; |
638 | } |
639 | #endif |
640 | |
641 | #endif |
642 | #endif |
643 | |