1//===-- aarch64 implementation of memory function building blocks ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file provides aarch64 specific building blocks to compose memory
10// functions.
11//
12//===----------------------------------------------------------------------===//
13#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
14#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
15
16#include "src/__support/macros/properties/architectures.h"
17
18#if defined(LIBC_TARGET_ARCH_IS_AARCH64)
19
20#include "src/__support/CPP/type_traits.h" // cpp::always_false
21#include "src/__support/common.h"
22#include "src/string/memory_utils/op_generic.h"
23
24#ifdef __ARM_NEON
25#include <arm_neon.h>
26#endif //__ARM_NEON
27
28namespace LIBC_NAMESPACE::aarch64 {
29
// Compile-time flag: true iff ARM NEON SIMD intrinsics are available
// (i.e. <arm_neon.h> was included above under __ARM_NEON).
LIBC_INLINE_VAR constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);
31
32namespace neon {
33
// Zeroes memory one cache line (64 bytes) at a time using the 'dc zva'
// instruction, which zeroes a whole data-cache block without reading it.
// The uint8_t value parameter is accepted for interface compatibility with
// other Memset building blocks but is ignored: 'dc zva' can only write zero.
// NOTE(review): 'dc zva' zeroes the entire block containing the address, so
// correctness presumably requires dst to be 64-byte aligned and count >= SIZE
// (the do/while below underflows for count < SIZE) — confirm with callers.
struct BzeroCacheLine {
  static constexpr size_t SIZE = 64;

  // Zero the 64-byte cache line at dst.
  LIBC_INLINE static void block(Ptr dst, uint8_t) {
#if __SIZEOF_POINTER__ == 4
    // ILP32: the pointer lives in a 32-bit register, so use the %w operand
    // modifier to name the W (32-bit) view of the register.
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  // Zero [dst, dst + count) by looping whole cache lines, then finishing the
  // last (possibly unaligned) chunk with regular vector stores.
  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    generic::Memset<generic_v512>::tail(dst, value, count);
  }
};
55
// Returns true iff 'dc zva' may be used and its block size is 64 bytes.
// Reads the Data Cache Zero ID register (DCZID_EL0).
LIBC_INLINE bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in (4-byte) words.
  // So the next line checks both that the instruction is permitted and that
  // the block size is 2^4 = 16 words, i.e. 64 bytes (matching
  // BzeroCacheLine::SIZE).
  return (zva_val & 0b11111) == 0b00100;
}
65
66} // namespace neon
67
68///////////////////////////////////////////////////////////////////////////////
69// Bcmp
// NEON implementation of bcmp-style comparison over fixed-size blocks.
// Returns zero when the two buffers are equal over the inspected bytes and a
// nonzero value otherwise (the value itself carries no ordering information).
// Size must be 16, 32, or a multiple of BlockSize; other sizes fail to
// compile via the static_asserts below.
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  // Granularity used by the generic multiple-of-BlockSize fallback in block().
  static constexpr size_t BlockSize = 32;

  // Reinterpret a CPtr as a byte pointer suitable for vld1q_u8.
  LIBC_INLINE static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  // Compare exactly Size bytes at p1 and p2.
  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t n = vld1q_u8(_p2);
      // a ^ n is all-zero iff the 16 bytes match. Reduce the 128-bit XOR to a
      // 32-bit value: saturating-narrow 2x64b -> 2x32b (nonzero lanes stay
      // nonzero thanks to saturation), then take the max of the two lanes.
      uint8x16_t an = veorq_u8(a, n);
      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
      return vmaxv_u32(an_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      // Generic case: walk the buffers BlockSize bytes at a time, bailing out
      // on the first mismatching chunk.
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  // Compare the *last* SIZE bytes of [p, p + count). Requires count >= SIZE.
  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  // Compare count bytes by inspecting the first SIZE and last SIZE bytes
  // (the two reads may overlap). Presumably intended for
  // SIZE <= count <= 2 * SIZE — confirm with callers.
  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      // First 16 bytes (a/n) and last 16 bytes (b/o) of each buffer.
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      // First 32 bytes (a,b / n,o) and last 32 bytes (c,d / p,q) of each
      // buffer; XOR pairwise, OR everything together, then reduce as above.
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
    }
    return BcmpReturnType::zero();
  }

  // Compare count bytes (count >= SIZE) by looping SIZE-byte blocks, then
  // finishing with an overlapping comparison of the last SIZE bytes.
  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
172
173} // namespace LIBC_NAMESPACE::aarch64
174
175namespace LIBC_NAMESPACE::generic {
176
177///////////////////////////////////////////////////////////////////////////////
178// Specializations for uint16_t
179template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
180template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
181 return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
182}
183template <>
184LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
185 return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
186}
187template <>
188LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
189 return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
190 static_cast<int32_t>(load_be<uint16_t>(p2, offset));
191}
192
193///////////////////////////////////////////////////////////////////////////////
194// Specializations for uint32_t
195template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
196template <>
197LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
198 return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
199}
200template <>
201LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
202 const auto a = load_be<uint32_t>(p1, offset);
203 const auto b = load_be<uint32_t>(p2, offset);
204 return a > b ? 1 : a < b ? -1 : 0;
205}
206
207///////////////////////////////////////////////////////////////////////////////
208// Specializations for uint64_t
209template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
210template <>
211LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
212 return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
213}
214template <>
215LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
216 const auto a = load_be<uint64_t>(p1, offset);
217 const auto b = load_be<uint64_t>(p2, offset);
218 if (a != b)
219 return a > b ? 1 : -1;
220 return MemcmpReturnType::zero();
221}
222
223///////////////////////////////////////////////////////////////////////////////
224// Specializations for uint8x16_t
225template <> struct is_vector<uint8x16_t> : cpp::true_type {};
226template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
227template <>
228LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
229 for (size_t i = 0; i < 2; ++i) {
230 auto a = load<uint64_t>(p1, offset);
231 auto b = load<uint64_t>(p2, offset);
232 uint32_t cond = a != b;
233 if (cond)
234 return cond;
235 offset += sizeof(uint64_t);
236 }
237 return 0;
238}
239template <>
240LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
241 for (size_t i = 0; i < 2; ++i) {
242 auto a = load_be<uint64_t>(p1, offset);
243 auto b = load_be<uint64_t>(p2, offset);
244 if (a != b)
245 return cmp_neq_uint64_t(a, b);
246 offset += sizeof(uint64_t);
247 }
248 return MemcmpReturnType::zero();
249}
250
251///////////////////////////////////////////////////////////////////////////////
252// Specializations for uint8x16x2_t
253template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
254template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
255template <>
256LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
257 size_t offset) {
258 for (size_t i = 0; i < 4; ++i) {
259 auto a = load_be<uint64_t>(p1, offset);
260 auto b = load_be<uint64_t>(p2, offset);
261 if (a != b)
262 return cmp_neq_uint64_t(a, b);
263 offset += sizeof(uint64_t);
264 }
265 return MemcmpReturnType::zero();
266}
267} // namespace LIBC_NAMESPACE::generic
268
269#endif // LIBC_TARGET_ARCH_IS_AARCH64
270
271#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
272

source code of libc/src/string/memory_utils/op_aarch64.h