//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace LIBC_NAMESPACE::aarch64 {

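// Compile-time capability flag: true when the target was built with NEON
// support. Exposing it as a constexpr bool lets higher-level code select the
// NEON implementations via 'if constexpr' or template parameters instead of
// the preprocessor.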
LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
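    // 'dc zva' (Data Cache Zero by VA) zeroes the naturally aligned block
    // containing the given address, so callers must pass a cache-line
    // aligned 'dst' for the write to cover exactly [dst, dst + SIZE). This
    // primitive can only write zeros; the 'value' parameter is ignored.
    // On ILP32 targets the pointer is 32 bits wide, hence the '%w' operand
    // modifier selecting the W (32-bit) register view.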
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

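  // Zeroes [dst, dst + count) one cache line at a time, then finishes with a
  // generic 64-byte store ending exactly at dst + count; the tail may
  // overlap the last zeroed line, which is harmless for memset semantics.
  // Assumes count >= SIZE, a cache-line aligned dst, and value == 0.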
  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // The tail is an unaligned store, so we can't use 'dc zva' for it.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is the log2 of the block size in (4-byte) words.
  // So the next line checks that the instruction is permitted and that the
  // block size is 16 words (i.e. 64 bytes).
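  // Example: zva_val == 0b00100 decodes as DZP = 0 (DC ZVA permitted) and
  // BS = 4, i.e. a block size of 2^4 = 16 words * 4 bytes = 64 bytes.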
  return (zva_val & 0b11111) == 0b00100;
}
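
// Illustrative use (not part of this header): a memset dispatcher would
// typically gate the cache-line zeroing path on both the compile-time and
// the runtime check, e.g.
//   if (aarch64::kNeon && aarch64::neon::hasZva() && count >= 64)
//     ... take the BzeroCacheLine fast path for zeroing ...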

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;
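
  // Each entry point returns a BcmpReturnType that is zero iff the compared
  // bytes are equal; any nonzero value signals a mismatch, matching bcmp
  // semantics where only zero vs. nonzero is meaningful.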

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
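      // vqmovn_u64 narrows each 64-bit lane to 32 bits with saturation, so a
      // result lane is nonzero iff the corresponding lane of 'an' was; the
      // max across lanes below is then zero exactly when a == n.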
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte of anbo is nonzero iff
      // the two buffers differ. We reduce this value down to 4 bytes in two
      // steps. First, narrow with saturation from 2x64b to 2x32b, which
      // keeps each lane nonzero exactly when the source lane was. Second,
      // take the max of the 2x32b lanes to get a single 32-bit value that is
      // nonzero iff a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

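  // Compares the last SIZE bytes of both buffers. Used after an aligned
  // loop, so the compared region may overlap previously compared blocks;
  // assumes count >= SIZE.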
  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

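  // Handles SIZE <= count <= 2 * SIZE by comparing the first and the last
  // SIZE bytes of each buffer; the loads overlap when count < 2 * SIZE,
  // which is benign since overlapping bytes are simply compared twice.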
  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32-bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

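  // Compares SIZE-byte blocks while more than SIZE bytes remain, then
  // finishes with a (possibly overlapping) comparison of the last SIZE
  // bytes. Assumes count >= SIZE.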
  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};

} // namespace LIBC_NAMESPACE::aarch64

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
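// Loading both values big-endian makes integer ordering agree with the
// lexicographic byte order memcmp requires, and the difference of two 16-bit
// values always fits in int32_t, so a single subtraction yields a correctly
// signed result without overflow.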
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
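// Unlike the 16-bit case, the difference of two uint32_t values does not fit
// in int32_t, so compare explicitly instead of subtracting.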
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
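// Note: the uint8x16_t tag only selects the 16-byte block size; the
// comparisons below intentionally use pairs of scalar 64-bit loads,
// presumably to avoid moving comparison results out of the vector registers.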
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H