// Warning: This file is not a C or C++ file. It does not have highlighting.

//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
13#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
14#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
15
16#include "src/__support/macros/attributes.h" // LIBC_INLINE
17#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL
18#include "src/__support/macros/properties/architectures.h"
19
20#if defined(LIBC_TARGET_ARCH_IS_AARCH64)
21
22#include "src/__support/CPP/type_traits.h" // cpp::always_false
23#include "src/__support/common.h"
24#include "src/string/memory_utils/op_generic.h"
25
26#ifdef __ARM_NEON
27#include <arm_neon.h>
28
29namespace LIBC_NAMESPACE_DECL {
30namespace aarch64 {
31
32LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);
33
34namespace neon {
35
36struct BzeroCacheLine {
37 static constexpr size_t SIZE = 64;
38
39 LIBC_INLINE static void block(Ptr dst, uint8_t) {
40#if __SIZEOF_POINTER__ == 4
41 asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
42#else
43 asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
44#endif
45 }
46
47 LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
48 size_t offset = 0;
49 do {
50 block(dst + offset, value);
51 offset += SIZE;
52 } while (offset < count - SIZE);
53 // Unaligned store, we can't use 'dc zva' here.
54 generic::Memset<generic_v512>::tail(dst, value, count);
55 }
56};
57
58LIBC_INLINE bool hasZva() {
59 uint64_t zva_val;
60 asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
61 // DC ZVA is permitted if DZP, bit [4] is zero.
62 // BS, bits [3:0] is log2 of the block count in words.
63 // So the next line checks whether the instruction is permitted and block
64 // count is 16 words (i.e. 64 bytes).
65 return (zva_val & 0b11111) == 0b00100;
66}
67
68} // namespace neon
69
70///////////////////////////////////////////////////////////////////////////////
71// Bcmp
72template <size_t Size> struct Bcmp {
73 static constexpr size_t SIZE = Size;
74 static constexpr size_t BlockSize = 32;
75
76 LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
77 return reinterpret_cast<const unsigned char *>(ptr);
78 }
79
80 LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
81 if constexpr (Size == 16) {
82 auto _p1 = as_u8(p1);
83 auto _p2 = as_u8(p2);
84 uint8x16_t a = vld1q_u8(_p1);
85 uint8x16_t n = vld1q_u8(_p2);
86 uint8x16_t an = veorq_u8(a, n);
87 uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
88 return vmaxv_u32(an_reduced);
89 } else if constexpr (Size == 32) {
90 auto _p1 = as_u8(p1);
91 auto _p2 = as_u8(p2);
92 uint8x16_t a = vld1q_u8(_p1);
93 uint8x16_t b = vld1q_u8(_p1 + 16);
94 uint8x16_t n = vld1q_u8(_p2);
95 uint8x16_t o = vld1q_u8(_p2 + 16);
96 uint8x16_t an = veorq_u8(a, n);
97 uint8x16_t bo = veorq_u8(b, o);
98 // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
99 // a difference between the two buffers. We reduce this value down to 4
100 // bytes in two steps. First, calculate the saturated move value when
101 // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
102 // a single 32 bit nonzero value if a mismatch occurred.
103 uint8x16_t anbo = vorrq_u8(an, bo);
104 uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
105 return vmaxv_u32(anbo_reduced);
106 } else if constexpr ((Size % BlockSize) == 0) {
107 for (size_t offset = 0; offset < Size; offset += BlockSize)
108 if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
109 return value;
110 } else {
111 static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
112 }
113 return BcmpReturnType::zero();
114 }
115
116 LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
117 return block(p1 + count - SIZE, p2 + count - SIZE);
118 }
119
120 LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
121 if constexpr (Size == 16) {
122 auto _p1 = as_u8(p1);
123 auto _p2 = as_u8(p2);
124 uint8x16_t a = vld1q_u8(_p1);
125 uint8x16_t b = vld1q_u8(_p1 + count - 16);
126 uint8x16_t n = vld1q_u8(_p2);
127 uint8x16_t o = vld1q_u8(_p2 + count - 16);
128 uint8x16_t an = veorq_u8(a, n);
129 uint8x16_t bo = veorq_u8(b, o);
130 // anbo = (a ^ n) | (b ^ o)
131 uint8x16_t anbo = vorrq_u8(an, bo);
132 uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
133 return vmaxv_u32(anbo_reduced);
134 } else if constexpr (Size == 32) {
135 auto _p1 = as_u8(p1);
136 auto _p2 = as_u8(p2);
137 uint8x16_t a = vld1q_u8(_p1);
138 uint8x16_t b = vld1q_u8(_p1 + 16);
139 uint8x16_t c = vld1q_u8(_p1 + count - 16);
140 uint8x16_t d = vld1q_u8(_p1 + count - 32);
141 uint8x16_t n = vld1q_u8(_p2);
142 uint8x16_t o = vld1q_u8(_p2 + 16);
143 uint8x16_t p = vld1q_u8(_p2 + count - 16);
144 uint8x16_t q = vld1q_u8(_p2 + count - 32);
145 uint8x16_t an = veorq_u8(a, n);
146 uint8x16_t bo = veorq_u8(b, o);
147 uint8x16_t cp = veorq_u8(c, p);
148 uint8x16_t dq = veorq_u8(d, q);
149 uint8x16_t anbo = vorrq_u8(an, bo);
150 uint8x16_t cpdq = vorrq_u8(cp, dq);
151 // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
152 // a nonzero 32 bit value if a mismatch occurred.
153 uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
154 uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
155 return vmaxv_u32(abnocpdq_reduced);
156 } else {
157 static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
158 }
159 return BcmpReturnType::zero();
160 }
161
162 LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
163 size_t count) {
164 static_assert(Size > 1, "a loop of size 1 does not need tail");
165 size_t offset = 0;
166 do {
167 if (auto value = block(p1 + offset, p2 + offset))
168 return value;
169 offset += SIZE;
170 } while (offset < count - SIZE);
171 return tail(p1, p2, count);
172 }
173};
174
175} // namespace aarch64
176} // namespace LIBC_NAMESPACE_DECL
177
178#endif //__ARM_NEON
179
180namespace LIBC_NAMESPACE_DECL {
181namespace generic {
182
183///////////////////////////////////////////////////////////////////////////////
184// Specializations for uint16_t
185template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
186template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
187 return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
188}
189template <>
190LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
191 return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
192}
193template <>
194LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
195 return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
196 static_cast<int32_t>(load_be<uint16_t>(p2, offset));
197}
198
199///////////////////////////////////////////////////////////////////////////////
200// Specializations for uint32_t
201template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
202template <>
203LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
204 return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
205}
206template <>
207LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
208 const auto a = load_be<uint32_t>(p1, offset);
209 const auto b = load_be<uint32_t>(p2, offset);
210 return a > b ? 1 : a < b ? -1 : 0;
211}
212
213///////////////////////////////////////////////////////////////////////////////
214// Specializations for uint64_t
215template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
216template <>
217LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
218 return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
219}
220template <>
221LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
222 const auto a = load_be<uint64_t>(p1, offset);
223 const auto b = load_be<uint64_t>(p2, offset);
224 if (a != b)
225 return a > b ? 1 : -1;
226 return MemcmpReturnType::zero();
227}
228
229#if defined(__ARM_NEON)
230
231///////////////////////////////////////////////////////////////////////////////
232// Specializations for uint8x16_t
233template <> struct is_vector<uint8x16_t> : cpp::true_type {};
234template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
235template <>
236LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
237 for (size_t i = 0; i < 2; ++i) {
238 auto a = load<uint64_t>(p1, offset);
239 auto b = load<uint64_t>(p2, offset);
240 uint32_t cond = a != b;
241 if (cond)
242 return cond;
243 offset += sizeof(uint64_t);
244 }
245 return 0;
246}
247template <>
248LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
249 for (size_t i = 0; i < 2; ++i) {
250 auto a = load_be<uint64_t>(p1, offset);
251 auto b = load_be<uint64_t>(p2, offset);
252 if (a != b)
253 return cmp_neq_uint64_t(a, b);
254 offset += sizeof(uint64_t);
255 }
256 return MemcmpReturnType::zero();
257}
258
259///////////////////////////////////////////////////////////////////////////////
260// Specializations for uint8x16x2_t
261template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
262template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
263template <>
264LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
265 size_t offset) {
266 for (size_t i = 0; i < 4; ++i) {
267 auto a = load_be<uint64_t>(p1, offset);
268 auto b = load_be<uint64_t>(p2, offset);
269 if (a != b)
270 return cmp_neq_uint64_t(a, b);
271 offset += sizeof(uint64_t);
272 }
273 return MemcmpReturnType::zero();
274}
275
276#endif // __ARM_NEON
277
278} // namespace generic
279} // namespace LIBC_NAMESPACE_DECL
280
281#endif // LIBC_TARGET_ARCH_IS_AARCH64
282
283#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
284

// Warning: This file is not a C or C++ file. It does not have highlighting.
//
// source code of libc/src/string/memory_utils/op_aarch64.h