//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
13 | #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H |
14 | #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H |
15 | |
16 | #include "src/__support/macros/attributes.h" // LIBC_INLINE |
17 | #include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL |
18 | #include "src/__support/macros/properties/architectures.h" |
19 | |
20 | #if defined(LIBC_TARGET_ARCH_IS_AARCH64) |
21 | |
22 | #include "src/__support/CPP/type_traits.h" // cpp::always_false |
23 | #include "src/__support/common.h" |
24 | #include "src/string/memory_utils/op_generic.h" |
25 | |
26 | #ifdef __ARM_NEON |
27 | #include <arm_neon.h> |
28 | |
29 | namespace LIBC_NAMESPACE_DECL { |
30 | namespace aarch64 { |
31 | |
32 | LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON); |
33 | |
34 | namespace neon { |
35 | |
36 | struct BzeroCacheLine { |
37 | static constexpr size_t SIZE = 64; |
38 | |
39 | LIBC_INLINE static void block(Ptr dst, uint8_t) { |
40 | #if __SIZEOF_POINTER__ == 4 |
41 | asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); |
42 | #else |
43 | asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); |
44 | #endif |
45 | } |
46 | |
47 | LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) { |
48 | size_t offset = 0; |
49 | do { |
50 | block(dst + offset, value); |
51 | offset += SIZE; |
52 | } while (offset < count - SIZE); |
53 | // Unaligned store, we can't use 'dc zva' here. |
54 | generic::Memset<generic_v512>::tail(dst, value, count); |
55 | } |
56 | }; |
57 | |
58 | LIBC_INLINE bool hasZva() { |
59 | uint64_t zva_val; |
60 | asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); |
61 | // DC ZVA is permitted if DZP, bit [4] is zero. |
62 | // BS, bits [3:0] is log2 of the block count in words. |
63 | // So the next line checks whether the instruction is permitted and block |
64 | // count is 16 words (i.e. 64 bytes). |
65 | return (zva_val & 0b11111) == 0b00100; |
66 | } |
67 | |
68 | } // namespace neon |
69 | |
70 | /////////////////////////////////////////////////////////////////////////////// |
71 | // Bcmp |
72 | template <size_t Size> struct Bcmp { |
73 | static constexpr size_t SIZE = Size; |
74 | static constexpr size_t BlockSize = 32; |
75 | |
76 | LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) { |
77 | return reinterpret_cast<const unsigned char *>(ptr); |
78 | } |
79 | |
80 | LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) { |
81 | if constexpr (Size == 16) { |
82 | auto _p1 = as_u8(p1); |
83 | auto _p2 = as_u8(p2); |
84 | uint8x16_t a = vld1q_u8(_p1); |
85 | uint8x16_t n = vld1q_u8(_p2); |
86 | uint8x16_t an = veorq_u8(a, n); |
87 | uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an)); |
88 | return vmaxv_u32(an_reduced); |
89 | } else if constexpr (Size == 32) { |
90 | auto _p1 = as_u8(p1); |
91 | auto _p2 = as_u8(p2); |
92 | uint8x16_t a = vld1q_u8(_p1); |
93 | uint8x16_t b = vld1q_u8(_p1 + 16); |
94 | uint8x16_t n = vld1q_u8(_p2); |
95 | uint8x16_t o = vld1q_u8(_p2 + 16); |
96 | uint8x16_t an = veorq_u8(a, n); |
97 | uint8x16_t bo = veorq_u8(b, o); |
98 | // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is |
99 | // a difference between the two buffers. We reduce this value down to 4 |
100 | // bytes in two steps. First, calculate the saturated move value when |
101 | // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get |
102 | // a single 32 bit nonzero value if a mismatch occurred. |
103 | uint8x16_t anbo = vorrq_u8(an, bo); |
104 | uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); |
105 | return vmaxv_u32(anbo_reduced); |
106 | } else if constexpr ((Size % BlockSize) == 0) { |
107 | for (size_t offset = 0; offset < Size; offset += BlockSize) |
108 | if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset)) |
109 | return value; |
110 | } else { |
111 | static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented"); |
112 | } |
113 | return BcmpReturnType::zero(); |
114 | } |
115 | |
116 | LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { |
117 | return block(p1 + count - SIZE, p2 + count - SIZE); |
118 | } |
119 | |
120 | LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { |
121 | if constexpr (Size == 16) { |
122 | auto _p1 = as_u8(p1); |
123 | auto _p2 = as_u8(p2); |
124 | uint8x16_t a = vld1q_u8(_p1); |
125 | uint8x16_t b = vld1q_u8(_p1 + count - 16); |
126 | uint8x16_t n = vld1q_u8(_p2); |
127 | uint8x16_t o = vld1q_u8(_p2 + count - 16); |
128 | uint8x16_t an = veorq_u8(a, n); |
129 | uint8x16_t bo = veorq_u8(b, o); |
130 | // anbo = (a ^ n) | (b ^ o) |
131 | uint8x16_t anbo = vorrq_u8(an, bo); |
132 | uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); |
133 | return vmaxv_u32(anbo_reduced); |
134 | } else if constexpr (Size == 32) { |
135 | auto _p1 = as_u8(p1); |
136 | auto _p2 = as_u8(p2); |
137 | uint8x16_t a = vld1q_u8(_p1); |
138 | uint8x16_t b = vld1q_u8(_p1 + 16); |
139 | uint8x16_t c = vld1q_u8(_p1 + count - 16); |
140 | uint8x16_t d = vld1q_u8(_p1 + count - 32); |
141 | uint8x16_t n = vld1q_u8(_p2); |
142 | uint8x16_t o = vld1q_u8(_p2 + 16); |
143 | uint8x16_t p = vld1q_u8(_p2 + count - 16); |
144 | uint8x16_t q = vld1q_u8(_p2 + count - 32); |
145 | uint8x16_t an = veorq_u8(a, n); |
146 | uint8x16_t bo = veorq_u8(b, o); |
147 | uint8x16_t cp = veorq_u8(c, p); |
148 | uint8x16_t dq = veorq_u8(d, q); |
149 | uint8x16_t anbo = vorrq_u8(an, bo); |
150 | uint8x16_t cpdq = vorrq_u8(cp, dq); |
151 | // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to |
152 | // a nonzero 32 bit value if a mismatch occurred. |
153 | uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); |
154 | uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); |
155 | return vmaxv_u32(abnocpdq_reduced); |
156 | } else { |
157 | static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented"); |
158 | } |
159 | return BcmpReturnType::zero(); |
160 | } |
161 | |
162 | LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, |
163 | size_t count) { |
164 | static_assert(Size > 1, "a loop of size 1 does not need tail"); |
165 | size_t offset = 0; |
166 | do { |
167 | if (auto value = block(p1 + offset, p2 + offset)) |
168 | return value; |
169 | offset += SIZE; |
170 | } while (offset < count - SIZE); |
171 | return tail(p1, p2, count); |
172 | } |
173 | }; |
174 | |
175 | } // namespace aarch64 |
176 | } // namespace LIBC_NAMESPACE_DECL |
177 | |
178 | #endif //__ARM_NEON |
179 | |
180 | namespace LIBC_NAMESPACE_DECL { |
181 | namespace generic { |
182 | |
183 | /////////////////////////////////////////////////////////////////////////////// |
184 | // Specializations for uint16_t |
185 | template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {}; |
186 | template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) { |
187 | return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset); |
188 | } |
189 | template <> |
190 | LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) { |
191 | return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset); |
192 | } |
193 | template <> |
194 | LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) { |
195 | return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) - |
196 | static_cast<int32_t>(load_be<uint16_t>(p2, offset)); |
197 | } |
198 | |
199 | /////////////////////////////////////////////////////////////////////////////// |
200 | // Specializations for uint32_t |
201 | template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {}; |
202 | template <> |
203 | LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) { |
204 | return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset); |
205 | } |
206 | template <> |
207 | LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) { |
208 | const auto a = load_be<uint32_t>(p1, offset); |
209 | const auto b = load_be<uint32_t>(p2, offset); |
210 | return a > b ? 1 : a < b ? -1 : 0; |
211 | } |
212 | |
213 | /////////////////////////////////////////////////////////////////////////////// |
214 | // Specializations for uint64_t |
215 | template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {}; |
216 | template <> |
217 | LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) { |
218 | return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset); |
219 | } |
220 | template <> |
221 | LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) { |
222 | const auto a = load_be<uint64_t>(p1, offset); |
223 | const auto b = load_be<uint64_t>(p2, offset); |
224 | if (a != b) |
225 | return a > b ? 1 : -1; |
226 | return MemcmpReturnType::zero(); |
227 | } |
228 | |
229 | #if defined(__ARM_NEON) |
230 | |
231 | /////////////////////////////////////////////////////////////////////////////// |
232 | // Specializations for uint8x16_t |
233 | template <> struct is_vector<uint8x16_t> : cpp::true_type {}; |
234 | template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {}; |
235 | template <> |
236 | LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) { |
237 | for (size_t i = 0; i < 2; ++i) { |
238 | auto a = load<uint64_t>(p1, offset); |
239 | auto b = load<uint64_t>(p2, offset); |
240 | uint32_t cond = a != b; |
241 | if (cond) |
242 | return cond; |
243 | offset += sizeof(uint64_t); |
244 | } |
245 | return 0; |
246 | } |
247 | template <> |
248 | LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) { |
249 | for (size_t i = 0; i < 2; ++i) { |
250 | auto a = load_be<uint64_t>(p1, offset); |
251 | auto b = load_be<uint64_t>(p2, offset); |
252 | if (a != b) |
253 | return cmp_neq_uint64_t(a, b); |
254 | offset += sizeof(uint64_t); |
255 | } |
256 | return MemcmpReturnType::zero(); |
257 | } |
258 | |
259 | /////////////////////////////////////////////////////////////////////////////// |
260 | // Specializations for uint8x16x2_t |
261 | template <> struct is_vector<uint8x16x2_t> : cpp::true_type {}; |
262 | template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {}; |
263 | template <> |
264 | LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2, |
265 | size_t offset) { |
266 | for (size_t i = 0; i < 4; ++i) { |
267 | auto a = load_be<uint64_t>(p1, offset); |
268 | auto b = load_be<uint64_t>(p2, offset); |
269 | if (a != b) |
270 | return cmp_neq_uint64_t(a, b); |
271 | offset += sizeof(uint64_t); |
272 | } |
273 | return MemcmpReturnType::zero(); |
274 | } |
275 | |
276 | #endif // __ARM_NEON |
277 | |
278 | } // namespace generic |
279 | } // namespace LIBC_NAMESPACE_DECL |
280 | |
281 | #endif // LIBC_TARGET_ARCH_IS_AARCH64 |
282 | |
283 | #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H |
284 |
Warning: This file is not a C or C++ file. It does not have highlighting.