1 | /* Utilities for Advanced SIMD libmvec routines. |
2 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #ifndef _V_MATH_H |
20 | #define _V_MATH_H |
21 | |
22 | #include <arm_neon.h> |
23 | #include "vecmath_config.h" |
24 | |
/* Attribute that makes the function it is applied to use the AArch64
   vector procedure call standard.  */
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))

/* Build the symbol name of the Advanced SIMD variant of the scalar routine
   NAME.  The F forms give the 4-lane name with an 'f' suffix appended, the
   D forms the 2-lane name; the trailing 1 or 2 is the number of vector
   arguments, each encoded as one 'v' in the symbol.  */
#define V_NAME_F1(name) _ZGVnN4v_##name##f
#define V_NAME_D1(name) _ZGVnN2v_##name
#define V_NAME_F2(name) _ZGVnN4vv_##name##f
#define V_NAME_D2(name) _ZGVnN2vv_##name
31 | |
32 | #include "advsimd_f32_protos.h" |
33 | |
/* Define the 2-lane entry point _ZGVnN2v_<fun>f in terms of the 4-lane
   routine _ZGVnN4v_<fun>f: the 64-bit argument is duplicated into both
   halves of a 128-bit vector, the 4-lane routine is called on it, and the
   low half of its result is returned.  */
#define HALF_WIDTH_ALIAS_F1(fun) \
  float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \
  { \
    float32x4_t wide = vcombine_f32 (x, x); \
    return vget_low_f32 (_ZGVnN4v_##fun##f (wide)); \
  }
39 | |
/* Two-argument counterpart of HALF_WIDTH_ALIAS_F1: define the 2-lane entry
   point _ZGVnN2vv_<fun>f by duplicating each 64-bit argument into both
   halves of a 128-bit vector, calling the 4-lane routine _ZGVnN4vv_<fun>f,
   and returning the low half of its result.  */
#define HALF_WIDTH_ALIAS_F2(fun) \
  float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \
  { \
    float32x4_t wide_x = vcombine_f32 (x, x); \
    float32x4_t wide_y = vcombine_f32 (y, y); \
    return vget_low_f32 (_ZGVnN4vv_##fun##f (wide_x, wide_y)); \
  }
46 | |
/* Brace initializers that repeat VAL in each of the 2, 4 or 8 elements of
   a constant.  */
#define V2(val) { val, val }
#define V4(val) { val, val, val, val }
#define V8(val) { val, val, val, val, val, val, val, val }
51 | |
/* Return nonzero if any bit of the 64-bit vector X is set, i.e. if at
   least one of its four 16-bit lanes is non-zero.  */
static inline int
v_any_u16h (uint16x4_t x)
{
  /* View the whole vector as a single 64-bit integer and test that.  */
  uint64_t bits = vget_lane_u64 (vreinterpret_u64_u16 (x), 0);
  return bits != 0;
}
57 | |
/* Broadcast the scalar X to all four lanes of a float32x4_t.  */
static inline float32x4_t
v_f32 (float x)
{
  float32x4_t splat = { x, x, x, x };
  return splat;
}
/* Broadcast the scalar X to all four lanes of a uint32x4_t.  */
static inline uint32x4_t
v_u32 (uint32_t x)
{
  uint32x4_t splat = { x, x, x, x };
  return splat;
}
/* Broadcast the scalar X to all four lanes of an int32x4_t.  */
static inline int32x4_t
v_s32 (int32_t x)
{
  int32x4_t splat = { x, x, x, x };
  return splat;
}
73 | |
/* Return nonzero if any lane of the vector comparison result X is
   non-zero.  */
static inline int
v_any_u32 (uint32x4_t x)
{
  /* Every lane is assumed to be either 0 or all-ones (-1u); under that
     assumption the sum of the two 64-bit halves is zero only when all four
     lanes are zero.  */
  uint64_t sum = vpaddd_u64 (vreinterpretq_u64_u32 (x));
  return sum != 0;
}
/* Return nonzero if any bit of the 64-bit vector X is set, i.e. if at
   least one of its two 32-bit lanes is non-zero.  */
static inline int
v_any_u32h (uint32x2_t x)
{
  /* View the whole vector as a single 64-bit integer and test that.  */
  uint64_t bits = vget_lane_u64 (vreinterpret_u64_u32 (x), 0);
  return bits != 0;
}
/* Gather four floats from TAB: result lane i is TAB[IDX[i]].  No bounds
   checking is done on the indices.  */
static inline float32x4_t
v_lookup_f32 (const float *tab, uint32x4_t idx)
{
  const float e0 = tab[idx[0]];
  const float e1 = tab[idx[1]];
  const float e2 = tab[idx[2]];
  const float e3 = tab[idx[3]];
  return (float32x4_t){ e0, e1, e2, e3 };
}
/* Gather four 32-bit words from TAB: result lane i is TAB[IDX[i]].  No
   bounds checking is done on the indices.  */
static inline uint32x4_t
v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
{
  const uint32_t e0 = tab[idx[0]];
  const uint32_t e1 = tab[idx[1]];
  const uint32_t e2 = tab[idx[2]];
  const uint32_t e3 = tab[idx[3]];
  return (uint32x4_t){ e0, e1, e2, e3 };
}
/* Apply the scalar routine F lane by lane under the mask P: each lane of
   the result is F applied to that lane of X where the matching lane of P is
   non-zero, and the matching lane of Y otherwise.  F is only called for the
   selected lanes.  */
static inline float32x4_t
v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
{
  /* Y is a by-value copy, so the selected lanes are overwritten in place.  */
  if (p[0])
    y[0] = f (x[0]);
  if (p[1])
    y[1] = f (x[1]);
  if (p[2])
    y[2] = f (x[2]);
  if (p[3])
    y[3] = f (x[3]);
  return y;
}
/* Two-argument form of the masked scalar call: each lane of the result is
   F applied to the matching lanes of X1 and X2 where the matching lane of P
   is non-zero, and the matching lane of Y otherwise.  F is only called for
   the selected lanes.  */
static inline float32x4_t
v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
	     float32x4_t y, uint32x4_t p)
{
  /* Y is a by-value copy, so the selected lanes are overwritten in place.  */
  if (p[0])
    y[0] = f (x1[0], x2[0]);
  if (p[1])
    y[1] = f (x1[1], x2[1]);
  if (p[2])
    y[2] = f (x1[2], x2[2]);
  if (p[3])
    y[3] = f (x1[3], x2[3]);
  return y;
}
111 | |
/* Broadcast the scalar X to both lanes of a float64x2_t.  */
static inline float64x2_t
v_f64 (double x)
{
  float64x2_t splat = { x, x };
  return splat;
}
/* Broadcast the scalar X to both lanes of a uint64x2_t.  */
static inline uint64x2_t
v_u64 (uint64_t x)
{
  uint64x2_t splat = { x, x };
  return splat;
}
/* Broadcast the scalar X to both lanes of an int64x2_t.  */
static inline int64x2_t
v_s64 (int64_t x)
{
  int64x2_t splat = { x, x };
  return splat;
}
127 | |
/* Return nonzero if any lane of the vector comparison result X is
   non-zero.  */
static inline int
v_any_u64 (uint64x2_t x)
{
  /* Both lanes are assumed to be either 0 or all-ones (-1u); under that
     assumption their sum is zero only when both lanes are zero.  */
  uint64_t sum = vpaddd_u64 (x);
  return sum != 0;
}
/* Return nonzero if every lane of the vector comparison result X is set
   (all-ones).  */
static inline int
v_all_u64 (uint64x2_t x)
{
  /* Both lanes are assumed to be either 0 or all-ones (-1u).  Read as
     signed values they are 0 or -1, so their sum is -2 exactly when both
     lanes are set.  */
  int64_t sum = vpaddd_s64 (vreinterpretq_s64_u64 (x));
  return sum == -2;
}
/* Gather two doubles from TAB: result lane i is TAB[IDX[i]].  No bounds
   checking is done on the indices.  */
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
  const double e0 = tab[idx[0]];
  const double e1 = tab[idx[1]];
  return (float64x2_t){ e0, e1 };
}
/* Gather two 64-bit words from TAB: result lane i is TAB[IDX[i]].  No
   bounds checking is done on the indices.  */
static inline uint64x2_t
v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
  const uint64_t e0 = tab[idx[0]];
  const uint64_t e1 = tab[idx[1]];
  return (uint64x2_t){ e0, e1 };
}
/* Apply the scalar routine F lane by lane under the mask P: each lane of
   the result is F applied to that lane of X where the matching lane of P is
   non-zero, and the matching lane of Y otherwise.  F is only called for the
   selected lanes.  */
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
  /* Lane 1 of P and X is read into scalars before the first call to F.
     NOTE(review): presumably so the values need not be recovered from
     vector registers after the call -- confirm before reordering.
     The mask lane is kept as an integer: converting an all-ones mask to
     double yields 2^64, which is out of range if it is later converted
     back to an integer type for the branch hint.  */
  uint64_t p1 = p[1];
  double x1 = x[1];
  if (__glibc_likely (p[0]))
    y[0] = f (x[0]);
  if (__glibc_likely (p1))
    y[1] = f (x1);
  return y;
}
/* Two-argument form of the masked scalar call: each lane of the result is
   F applied to the matching lanes of X1 and X2 where the matching lane of P
   is non-zero, and the matching lane of Y otherwise.  F is only called for
   the selected lanes.  */
static inline float64x2_t
v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
	     float64x2_t y, uint64x2_t p)
{
  /* Y is a by-value copy, so the selected lanes are overwritten in place.  */
  if (p[0])
    y[0] = f (x1[0], x2[0]);
  if (p[1])
    y[1] = f (x1[1], x2[1]);
  return y;
}
170 | |
171 | #endif |
172 | |