| 1 | /* Utilities for Advanced SIMD libmvec routines. |
| 2 | Copyright (C) 2023-2024 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | #ifndef _V_MATH_H |
| 20 | #define _V_MATH_H |
| 21 | |
| 22 | #include <arm_neon.h> |
| 23 | #include "vecmath_config.h" |
| 24 | |
/* All vector routines use the aarch64_vector_pcs calling convention,
   which preserves more SIMD state across calls than the base PCS.  */
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))

/* Build the ABI-mangled public names: _ZGVnN<lanes><args>_<name>, per
   the AArch64 vector function ABI ('n' = Advanced SIMD, 'N' = unmasked,
   one 'v' per vector argument; float variants get an 'f' suffix).  */
#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
#define V_NAME_D1(fun) _ZGVnN2v_##fun
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
#define V_NAME_D2(fun) _ZGVnN2vv_##fun

#include "advsimd_f32_protos.h"

/* Define the 2-lane single-precision variant of FUN by duplicating the
   input into both halves of a 4-lane vector, calling the 4-lane
   routine, and returning the low half of the result.  */
#define HALF_WIDTH_ALIAS_F1(fun)                                              \
  float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x)                     \
  {                                                                           \
    return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x)));            \
  }

/* Same as above for two-argument routines.  */
#define HALF_WIDTH_ALIAS_F2(fun)                                              \
  float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y)     \
  {                                                                           \
    return vget_low_f32 (                                                     \
	_ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y)));       \
  }

/* Shorthand helpers for declaring constants.  */
#define V2(X) { X, X }
#define V4(X) { X, X, X, X }
#define V8(X) { X, X, X, X, X, X, X, X }
| 51 | |
/* Return non-zero iff any 16-bit lane of the compare result X is set.  */
static inline int
v_any_u16h (uint16x4_t x)
{
  uint64_t bits = vget_lane_u64 (vreinterpret_u64_u16 (x), 0);
  return bits != 0;
}
| 57 | |
/* Broadcast the scalar X to all four float lanes.  */
static inline float32x4_t
v_f32 (float x)
{
  return vdupq_n_f32 (x);
}
/* Broadcast the scalar X to all four unsigned 32-bit lanes.  */
static inline uint32x4_t
v_u32 (uint32_t x)
{
  return vdupq_n_u32 (x);
}
/* Broadcast the scalar X to all four signed 32-bit lanes.  */
static inline int32x4_t
v_s32 (int32_t x)
{
  return vdupq_n_s32 (x);
}
| 73 | |
/* Return non-zero iff any 32-bit lane of the compare result X is set.  */
static inline int
v_any_u32 (uint32x4_t x)
{
  /* Lanes are all-ones or all-zeros, so the pairwise 64-bit sum of the
     two halves cannot wrap to zero unless every lane is zero.  */
  uint64_t sum = vpaddd_u64 (vreinterpretq_u64_u32 (x));
  return sum != 0;
}
/* Return non-zero iff any 32-bit lane of the half-width compare result
   X is set.  */
static inline int
v_any_u32h (uint32x2_t x)
{
  uint64_t bits = vget_lane_u64 (vreinterpret_u64_u32 (x), 0);
  return bits != 0;
}
/* Gather four floats from TAB at the per-lane indices in IDX.  */
static inline float32x4_t
v_lookup_f32 (const float *tab, uint32x4_t idx)
{
  float e0 = tab[idx[0]];
  float e1 = tab[idx[1]];
  float e2 = tab[idx[2]];
  float e3 = tab[idx[3]];
  return (float32x4_t){ e0, e1, e2, e3 };
}
/* Gather four 32-bit words from TAB at the per-lane indices in IDX.  */
static inline uint32x4_t
v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
{
  uint32_t e0 = tab[idx[0]];
  uint32_t e1 = tab[idx[1]];
  uint32_t e2 = tab[idx[2]];
  uint32_t e3 = tab[idx[3]];
  return (uint32x4_t){ e0, e1, e2, e3 };
}
/* For each lane whose mask bit in P is set, replace that lane of Y with
   the scalar fallback F applied to the corresponding lane of X.  */
static inline float32x4_t
v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
{
  float32x4_t r = y;
  if (p[0])
    r[0] = f (x[0]);
  if (p[1])
    r[1] = f (x[1]);
  if (p[2])
    r[2] = f (x[2]);
  if (p[3])
    r[3] = f (x[3]);
  return r;
}
/* Two-argument variant of v_call_f32: for each lane with mask bit set
   in P, replace that lane of Y with F (x1 lane, x2 lane).  */
static inline float32x4_t
v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
	     float32x4_t y, uint32x4_t p)
{
  float32x4_t r = y;
  if (p[0])
    r[0] = f (x1[0], x2[0]);
  if (p[1])
    r[1] = f (x1[1], x2[1]);
  if (p[2])
    r[2] = f (x1[2], x2[2]);
  if (p[3])
    r[3] = f (x1[3], x2[3]);
  return r;
}
| 111 | |
/* Broadcast the scalar X to both double lanes.  */
static inline float64x2_t
v_f64 (double x)
{
  return vdupq_n_f64 (x);
}
/* Broadcast the scalar X to both unsigned 64-bit lanes.  */
static inline uint64x2_t
v_u64 (uint64_t x)
{
  return vdupq_n_u64 (x);
}
/* Broadcast the scalar X to both signed 64-bit lanes.  */
static inline int64x2_t
v_s64 (int64_t x)
{
  return vdupq_n_s64 (x);
}
| 127 | |
/* Return non-zero iff any 64-bit lane of the compare result X is set.  */
static inline int
v_any_u64 (uint64x2_t x)
{
  /* Lanes are 0 or -1, so the pairwise sum is zero only when both
     lanes are zero.  */
  uint64_t sum = vpaddd_u64 (x);
  return sum != 0;
}
/* Return non-zero iff both 64-bit lanes of the compare result X are
   set.  */
static inline int
v_all_u64 (uint64x2_t x)
{
  /* Each lane is 0 or -1; the signed pairwise sum equals -2 exactly
     when both lanes are all-ones.  */
  int64_t sum = vpaddd_s64 (vreinterpretq_s64_u64 (x));
  return sum == -2;
}
/* Gather two doubles from TAB at the per-lane indices in IDX.  */
static inline float64x2_t
v_lookup_f64 (const double *tab, uint64x2_t idx)
{
  double e0 = tab[idx[0]];
  double e1 = tab[idx[1]];
  return (float64x2_t){ e0, e1 };
}
/* Gather two 64-bit words from TAB at the per-lane indices in IDX.  */
static inline uint64x2_t
v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
  uint64_t e0 = tab[idx[0]];
  uint64_t e1 = tab[idx[1]];
  return (uint64x2_t){ e0, e1 };
}
/* For each lane whose mask bit in P is set, replace that lane of Y with
   the scalar fallback F applied to the corresponding lane of X.  */
static inline float64x2_t
v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
  /* Lane 1 inputs are read before branching on lane 0, presumably so
     both loads can be issued early — keep that ordering.  */
  double need1 = p[1];
  double in1 = x[1];
  if (__glibc_likely (p[0]))
    y[0] = f (x[0]);
  if (__glibc_likely (need1))
    y[1] = f (in1);
  return y;
}
/* Two-argument variant of v_call_f64: for each lane with mask bit set
   in P, replace that lane of Y with F (x1 lane, x2 lane).  */
static inline float64x2_t
v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
	     float64x2_t y, uint64x2_t p)
{
  float64x2_t r = y;
  if (p[0])
    r[0] = f (x1[0], x2[0]);
  if (p[1])
    r[1] = f (x1[1], x2[1]);
  return r;
}
| 170 | |
| 171 | #endif |
| 172 | |