//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)

#include "src/__support/CPP/type_traits.h" // cpp::always_false
#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace LIBC_NAMESPACE::aarch64 {

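// Compile-time capability flag: true when the target was built with NEON
// support. Exposing it as a constexpr bool lets higher-level code select the
// NEON implementations via 'if constexpr' or template parameters instead of
// the preprocessor.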
LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  LIBC_INLINE static void block(Ptr dst, uint8_t) {
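    // 'dc zva' (Data Cache Zero by VA) zeroes the naturally aligned block
    // containing the given address, so callers must pass a cache-line
    // aligned 'dst' for the write to cover exactly [dst, dst + SIZE). This
    // primitive can only write zeros; the 'value' parameter is ignored.
    // On ILP32 targets the pointer is 32 bits wide, hence the '%w' operand
    // modifier selecting the W (32-bit) register view.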
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

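  // Zeroes [dst, dst + count) one cache line at a time, then finishes with a
  // generic 64-byte store ending exactly at dst + count; the tail may
  // overlap the last zeroed line, which is harmless for memset semantics.
  // Assumes count >= SIZE, a cache-line aligned dst, and value == 0.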
  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // The tail is an unaligned store, so we can't use 'dc zva' for it.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};

LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is the log2 of the block size in (4-byte) words.
  // So the next line checks that the instruction is permitted and that the
  // block size is 16 words (i.e. 64 bytes).
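  // Example: zva_val == 0b00100 decodes as DZP = 0 (DC ZVA permitted) and
  // BS = 4, i.e. a block size of 2^4 = 16 words * 4 bytes = 64 bytes.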
  return (zva_val & 0b11111) == 0b00100;
}
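
// Illustrative use (not part of this header): a memset dispatcher would
// typically gate the cache-line zeroing path on both the compile-time and
// the runtime check, e.g.
//   if (aarch64::kNeon && aarch64::neon::hasZva() && count >= 64)
//     ... take the BzeroCacheLine fast path for zeroing ...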

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;
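
  // Each entry point returns a BcmpReturnType that is zero iff the compared
  // bytes are equal; any nonzero value signals a mismatch, matching bcmp
  // semantics where only zero vs. nonzero is meaningful.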

  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t an = veorq_u8(a, n);
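      // vqmovn_u64 narrows each 64-bit lane to 32 bits with saturation, so a
      // result lane is nonzero iff the corresponding lane of 'an' was; the
      // max across lanes below is then zero exactly when a == n.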
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte of anbo is nonzero iff
      // the two buffers differ. We reduce this value down to 4 bytes in two
      // steps. First, narrow with saturation from 2x64b to 2x32b, which
      // keeps each lane nonzero exactly when the source lane was. Second,
      // take the max of the 2x32b lanes to get a single 32-bit value that is
      // nonzero iff a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

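  // Compares the last SIZE bytes of both buffers. Used after an aligned
  // loop, so the compared region may overlap previously compared blocks;
  // assumes count >= SIZE.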
  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

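  // Handles SIZE <= count <= 2 * SIZE by comparing the first and the last
  // SIZE bytes of each buffer; the loads overlap when count < 2 * SIZE,
  // which is benign since overlapping bytes are simply compared twice.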
  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32-bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

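  // Compares SIZE-byte blocks while more than SIZE bytes remain, then
  // finishes with a (possibly overlapping) comparison of the last SIZE
  // bytes. Assumes count >= SIZE.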
  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};

} // namespace LIBC_NAMESPACE::aarch64

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
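// Loading both values big-endian makes integer ordering agree with the
// lexicographic byte order memcmp requires, and the difference of two 16-bit
// values always fits in int32_t, so a single subtraction yields a correctly
// signed result without overflow.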
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
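// Unlike the 16-bit case, the difference of two uint32_t values does not fit
// in int32_t, so compare explicitly instead of subtracting.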
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return a > b ? 1 : a < b ? -1 : 0;
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  if (a != b)
    return a > b ? 1 : -1;
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
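// Note: the uint8x16_t tag only selects the 16-byte block size; the
// comparisons below intentionally use pairs of scalar 64-bit loads,
// presumably to avoid moving comparison results out of the vector registers.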
template <>
LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load<uint64_t>(p1, offset);
    auto b = load<uint64_t>(p2, offset);
    uint32_t cond = a != b;
    if (cond)
      return cond;
    offset += sizeof(uint64_t);
  }
  return 0;
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
  for (size_t i = 0; i < 2; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16x2_t
template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  for (size_t i = 0; i < 4; ++i) {
    auto a = load_be<uint64_t>(p1, offset);
    auto b = load_be<uint64_t>(p2, offset);
    if (a != b)
      return cmp_neq_uint64_t(a, b);
    offset += sizeof(uint64_t);
  }
  return MemcmpReturnType::zero();
}
} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H