//===-- Generic implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks.
// Depending on the requested size, the block operation uses unsigned integral
// types, vector types, or an array of the type with the maximum size.
//
// The maximum size is passed as a template argument. For instance, on x86
// platforms that only support integral types, the maximum size would be 8
// (corresponding to uint64_t). On such a platform, a request for size 32 would
// be treated as a cpp::array<uint64_t, 4>.
//
// On the other hand, if the platform is x86 with support for AVX, the maximum
// size is 32 and the operation can be handled with a single native operation.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/common.h"
#include "src/__support/endian.h"
#include "src/__support/macros/optimization.h"
#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT64
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/utils.h"

#include <stdint.h>

static_assert((UINTPTR_MAX == 4294967295U) ||
                  (UINTPTR_MAX == 18446744073709551615UL),
              "We currently only support 32- or 64-bit platforms");

namespace LIBC_NAMESPACE {
// Compiler types using the vector attributes.
using generic_v128 = uint8_t __attribute__((__vector_size__(16)));
using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE

namespace LIBC_NAMESPACE::generic {

// We accept three types of values as elements for generic operations:
// - scalar : unsigned integral types,
// - vector : compiler types using the vector attributes or platform builtins,
// - array : a cpp::array<T, N> where T is itself either a scalar or a vector.
// The following traits help discriminate between these cases.

template <typename T> struct is_scalar : cpp::false_type {};
template <> struct is_scalar<uint8_t> : cpp::true_type {};
template <> struct is_scalar<uint16_t> : cpp::true_type {};
template <> struct is_scalar<uint32_t> : cpp::true_type {};
#ifdef LIBC_TYPES_HAS_INT64
template <> struct is_scalar<uint64_t> : cpp::true_type {};
#endif // LIBC_TYPES_HAS_INT64
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr bool is_scalar_v = is_scalar<T>::value;

template <typename T> struct is_vector : cpp::false_type {};
template <> struct is_vector<generic_v128> : cpp::true_type {};
template <> struct is_vector<generic_v256> : cpp::true_type {};
template <> struct is_vector<generic_v512> : cpp::true_type {};
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr bool is_vector_v = is_vector<T>::value;

template <class T> struct is_array : cpp::false_type {};
template <class T, size_t N> struct is_array<cpp::array<T, N>> {
  // Meant to match std::numeric_limits interface.
  // NOLINTNEXTLINE(readability-identifier-naming)
  static constexpr bool value = is_scalar_v<T> || is_vector_v<T>;
};
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr bool is_array_v = is_array<T>::value;

// Meant to match std::numeric_limits interface.
// NOLINTBEGIN(readability-identifier-naming)
template <typename T>
constexpr bool is_element_type_v =
    is_scalar_v<T> || is_vector_v<T> || is_array_v<T>;
// NOLINTEND(readability-identifier-naming)

// Helper struct to retrieve the number of elements of an array.
template <class T> struct array_size {};
template <class T, size_t N>
struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
// Meant to match std::numeric_limits interface.
// NOLINTNEXTLINE(readability-identifier-naming)
template <typename T> constexpr size_t array_size_v = array_size<T>::value;
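
// For illustration only (these are not part of the interface), the traits
// above classify types as follows:
//
//   static_assert(is_scalar_v<uint32_t>);                       // scalar
//   static_assert(is_vector_v<generic_v128>);                   // vector
//   static_assert(is_array_v<cpp::array<uint32_t, 8>>);         // array
//   static_assert(!is_element_type_v<int *>);                   // rejected
//   static_assert(array_size_v<cpp::array<uint32_t, 8>> == 8);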

// Generic operations for the above type categories.

template <typename T> LIBC_INLINE T load(CPtr src) {
  static_assert(is_element_type_v<T>);
  if constexpr (is_scalar_v<T> || is_vector_v<T>) {
    return ::LIBC_NAMESPACE::load<T>(src);
  } else if constexpr (is_array_v<T>) {
    using value_type = typename T::value_type;
    T value;
    for (size_t i = 0; i < array_size_v<T>; ++i)
      value[i] = load<value_type>(src + (i * sizeof(value_type)));
    return value;
  }
}
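
// For example (illustrative), on a platform whose widest element is uint64_t,
// load<cpp::array<uint64_t, 4>>(src) decomposes into four scalar loads:
//
//   value[0] = load<uint64_t>(src + 0);
//   value[1] = load<uint64_t>(src + 8);
//   value[2] = load<uint64_t>(src + 16);
//   value[3] = load<uint64_t>(src + 24);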

template <typename T> LIBC_INLINE void store(Ptr dst, T value) {
  static_assert(is_element_type_v<T>);
  if constexpr (is_scalar_v<T> || is_vector_v<T>) {
    ::LIBC_NAMESPACE::store<T>(dst, value);
  } else if constexpr (is_array_v<T>) {
    using value_type = typename T::value_type;
    for (size_t i = 0; i < array_size_v<T>; ++i)
      store<value_type>(dst + (i * sizeof(value_type)), value[i]);
  }
}

template <typename T> LIBC_INLINE T splat(uint8_t value) {
  static_assert(is_scalar_v<T> || is_vector_v<T>);
  if constexpr (is_scalar_v<T>)
    return T(~0) / T(0xFF) * T(value);
  else if constexpr (is_vector_v<T>) {
    T out;
    // This for loop is optimized out for vector types.
    for (size_t i = 0; i < sizeof(T); ++i)
      out[i] = value;
    return out;
  }
}
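
// Worked example for the scalar branch: with T = uint32_t and value = 0xAB,
//   T(~0) / T(0xFF) == 0xFFFFFFFF / 0xFF == 0x01010101
//   0x01010101 * 0xAB == 0xABABABAB
// i.e. the byte is replicated into every byte of the scalar.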

///////////////////////////////////////////////////////////////////////////////
// Memset
///////////////////////////////////////////////////////////////////////////////

template <typename T> struct Memset {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
    if constexpr (is_scalar_v<T> || is_vector_v<T>) {
      store<T>(dst, splat<T>(value));
    } else if constexpr (is_array_v<T>) {
      using value_type = typename T::value_type;
      const auto Splat = splat<value_type>(value);
      for (size_t i = 0; i < array_size_v<T>; ++i)
        store<value_type>(dst + (i * sizeof(value_type)), Splat);
    }
  }

  LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }

  LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }

  LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
                                               size_t count, size_t offset) {
    static_assert(SIZE > 1, "a loop of size 1 does not need tail");
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, value, count);
  }

  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    return loop_and_tail_offset(dst, value, count, 0);
  }
};
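
// Illustrative only: with the blocks above, a memset for counts in [16, 32]
// can be written as a single head_tail on a 16-byte element, assuming
// generic_v128 is usable on the target:
//
//   LIBC_INLINE void set_16_to_32(Ptr dst, uint8_t value, size_t count) {
//     generic::Memset<generic_v128>::head_tail(dst, value, count);
//   }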

template <typename T, typename... TS> struct MemsetSequence {
  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
    Memset<T>::block(dst, value);
    if constexpr (sizeof...(TS) > 0)
      return MemsetSequence<TS...>::block(dst + sizeof(T), value);
  }
};
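
// Illustrative only: a sequence can set an exact, odd number of bytes by
// chaining decreasing types (assuming uint64_t is available on the target):
//
//   using Seq = generic::MemsetSequence<uint64_t, uint32_t, uint16_t, uint8_t>;
//   static_assert(Seq::SIZE == 15);
//   Seq::block(dst, value); // writes bytes [0, 15) of dst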

///////////////////////////////////////////////////////////////////////////////
// Memmove
///////////////////////////////////////////////////////////////////////////////

template <typename T> struct Memmove {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

  LIBC_INLINE static void block(Ptr dst, CPtr src) {
    store<T>(dst, load<T>(src));
  }

  LIBC_INLINE static void head_tail(Ptr dst, CPtr src, size_t count) {
    const size_t offset = count - SIZE;
    // The load and store operations can be performed in any order as long as
    // they are not interleaved. More investigation is needed to determine
    // the best order.
    const auto head = load<T>(src);
    const auto tail = load<T>(src + offset);
    store<T>(dst, head);
    store<T>(dst + offset, tail);
  }

  // Aligning forward is suitable when dst < src. The alignment is performed
  // with a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes forward, we make sure src is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [___________LLLLLLLA______________]
  // [_SSSSSSSS________________________]
  // [________SSSSSSSS_________________]
  //
  // e.g. Moving two bytes forward, we make sure dst is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [______LLLLLLLL___________________]
  // [_SSSSSSSS________________________]
  // [___SSSSSSSA______________________]
  template <Arg AlignOn>
  LIBC_INLINE static void align_forward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr prev_dst = dst;
    CPtr prev_src = src;
    size_t prev_count = count;
    align_to_next_boundary<SIZE, AlignOn>(dst, src, count);
    adjust(SIZE, dst, src, count);
    head_tail(prev_dst, prev_src, prev_count - count);
  }

  // Aligning backward is suitable when dst > src. The alignment is performed
  // with a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes backward, we make sure src is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _________________ALLLLLLL_______]
  // [ ___________________LLLLLLLL_____]
  // [____________________SSSSSSSS_____]
  // [______________________SSSSSSSS___]
  //
  // e.g. Moving two bytes backward, we make sure dst is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _______________LLLLLLLL_________]
  // [ ___________________LLLLLLLL_____]
  // [__________________ASSSSSSS_______]
  // [______________________SSSSSSSS___]
  template <Arg AlignOn>
  LIBC_INLINE static void align_backward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr headtail_dst = dst + count;
    CPtr headtail_src = src + count;
    size_t headtail_size = 0;
    align_to_next_boundary<SIZE, AlignOn>(headtail_dst, headtail_src,
                                          headtail_size);
    adjust(-2 * SIZE, headtail_dst, headtail_src, headtail_size);
    head_tail(headtail_dst, headtail_src, headtail_size);
    count -= headtail_size;
  }

  // Moving forward is suitable when dst < src. We load the tail bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [_________________________LLLLLLLL___]
  // [___LLLLLLLL_________________________]
  // [_SSSSSSSS___________________________]
  // [___________LLLLLLLL_________________]
  // [_________SSSSSSSS___________________]
  // [___________________LLLLLLLL_________]
  // [_________________SSSSSSSS___________]
  // [_______________________SSSSSSSS_____]
  LIBC_INLINE static void loop_and_tail_forward(Ptr dst, CPtr src,
                                                size_t count) {
    static_assert(SIZE > 1, "a loop of size 1 does not need tail");
    const size_t tail_offset = count - SIZE;
    const auto tail_value = load<T>(src + tail_offset);
    size_t offset = 0;
    LIBC_LOOP_NOUNROLL
    do {
      block(dst + offset, src + offset);
      offset += SIZE;
    } while (offset < count - SIZE);
    store<T>(dst + tail_offset, tail_value);
  }

  // Moving backward is suitable when dst > src. We load the head bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [___LLLLLLLL_________________________]
  // [_________________________LLLLLLLL___]
  // [___________________________SSSSSSSS_]
  // [_________________LLLLLLLL___________]
  // [___________________SSSSSSSS_________]
  // [_________LLLLLLLL___________________]
  // [___________SSSSSSSS_________________]
  // [_____SSSSSSSS_______________________]
  LIBC_INLINE static void loop_and_tail_backward(Ptr dst, CPtr src,
                                                 size_t count) {
    static_assert(SIZE > 1, "a loop of size 1 does not need tail");
    const auto head_value = load<T>(src);
    ptrdiff_t offset = count - SIZE;
    LIBC_LOOP_NOUNROLL
    do {
      block(dst + offset, src + offset);
      offset -= SIZE;
    } while (offset >= 0);
    store<T>(dst, head_value);
  }
};
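
// Illustrative only (the real dispatch lives in the memmove implementation
// files): for counts larger than SIZE, the loop direction can be chosen from
// the relative position of dst and src so that bytes are never overwritten
// before they are read:
//
//   LIBC_INLINE void move(Ptr dst, CPtr src, size_t count) {
//     // Assumes count > 32; smaller counts are handled separately.
//     if (dst < src)
//       generic::Memmove<generic_v256>::loop_and_tail_forward(dst, src, count);
//     else
//       generic::Memmove<generic_v256>::loop_and_tail_backward(dst, src, count);
//   }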

///////////////////////////////////////////////////////////////////////////////
// Low level operations for Bcmp and Memcmp that operate on memory locations.
///////////////////////////////////////////////////////////////////////////////

// Same as load above but with an offset to the pointer.
// Making the offset explicit hints the compiler to use relevant addressing
// modes consistently.
template <typename T> LIBC_INLINE T load(CPtr ptr, size_t offset) {
  return ::LIBC_NAMESPACE::load<T>(ptr + offset);
}

// Same as above but also makes sure the loaded value is in big endian format.
// This is useful when implementing lexicographic comparisons as big endian
// scalar comparison directly maps to lexicographic byte comparisons.
template <typename T> LIBC_INLINE T load_be(CPtr ptr, size_t offset) {
  return Endian::to_big_endian(load<T>(ptr, offset));
}
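
// Worked example (illustrative): comparing the 4-byte sequences "azzz" and
// "baaa". Lexicographically "azzz" < "baaa" since they differ in their first
// byte. On a little endian machine the raw uint32_t loads are 0x7A7A7A61 and
// 0x61616162, which compare the wrong way around; after to_big_endian they
// are 0x617A7A7A and 0x62616161, and the scalar comparison matches the
// byte-by-byte order.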

// Equality: returns true iff values at locations (p1 + offset) and (p2 +
// offset) compare equal.
template <typename T> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset);

// Inequality: returns non-zero iff values at locations (p1 + offset) and
// (p2 + offset) differ.
template <typename T> LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset);

// Lexicographic comparison:
// - returns 0 iff values at locations (p1 + offset) and (p2 + offset) compare
// equal.
// - returns a negative value if value at location (p1 + offset) is
// lexicographically less than value at (p2 + offset).
// - returns a positive value if value at location (p1 + offset) is
// lexicographically greater than value at (p2 + offset).
template <typename T>
LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset);

// Lexicographic comparison of non-equal values:
// - returns a negative value if value at location (p1 + offset) is
// lexicographically less than value at (p2 + offset).
// - returns a positive value if value at location (p1 + offset) is
// lexicographically greater than value at (p2 + offset).
template <typename T>
LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Memcmp implementation
//
// When building memcmp, not all types are considered equal.
//
// For instance, the lexicographic comparison of two uint8_t can be implemented
// as a simple subtraction, but for wider operations the logic can be much more
// involved, especially on little endian platforms.
//
// For such wider types it is a good strategy to test for equality first and
// only do the expensive lexicographic comparison if necessary.
//
// Decomposing the algorithm like this for wider types allows us to have
// efficient implementations of higher order functions like 'head_tail' or
// 'loop_and_tail'.
///////////////////////////////////////////////////////////////////////////////

// Type traits to decide whether we can use 'cmp' directly or if we need to
// split the computation.
template <typename T> struct cmp_is_expensive;
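
// As an illustration of how a wider scalar could plug into this scheme
// (hypothetical sketch, not the specialization actually used for uint32_t):
// mark the comparison as expensive, test equality with plain loads, and fall
// back to a big endian comparison only when the values differ.
//
//   template <> struct cmp_is_expensive<uint32_t> : cpp::true_type {};
//   template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
//     return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
//   }
//   template <>
//   LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2,
//                                                  size_t offset) {
//     const auto a = load_be<uint32_t>(p1, offset);
//     const auto b = load_be<uint32_t>(p2, offset);
//     return a < b ? -1 : 1;
//   }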

template <typename T> struct Memcmp {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

private:
  LIBC_INLINE static MemcmpReturnType block_offset(CPtr p1, CPtr p2,
                                                   size_t offset) {
    if constexpr (cmp_is_expensive<T>::value) {
      if (!eq<T>(p1, p2, offset))
        return cmp_neq<T>(p1, p2, offset);
      return MemcmpReturnType::zero();
    } else {
      return cmp<T>(p1, p2, offset);
    }
  }

public:
  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
    return block_offset(p1, p2, 0);
  }

  LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block_offset(p1, p2, count - SIZE);
  }

  LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
                                                size_t count) {
    if constexpr (cmp_is_expensive<T>::value) {
      if (!eq<T>(p1, p2, 0))
        return cmp_neq<T>(p1, p2, 0);
    } else {
      if (const auto value = cmp<T>(p1, p2, 0))
        return value;
    }
    return tail(p1, p2, count);
  }

  LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                    size_t count) {
    return loop_and_tail_offset(p1, p2, count, 0);
  }

  LIBC_INLINE static MemcmpReturnType
  loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
    if constexpr (SIZE > 1) {
      const size_t limit = count - SIZE;
      LIBC_LOOP_NOUNROLL
      for (; offset < limit; offset += SIZE) {
        if constexpr (cmp_is_expensive<T>::value) {
          if (!eq<T>(p1, p2, offset))
            return cmp_neq<T>(p1, p2, offset);
        } else {
          if (const auto value = cmp<T>(p1, p2, offset))
            return value;
        }
      }
      return block_offset(p1, p2, limit); // tail
    } else {
      // No need for a tail operation when SIZE == 1.
      LIBC_LOOP_NOUNROLL
      for (; offset < count; offset += SIZE)
        if (auto value = cmp<T>(p1, p2, offset))
          return value;
      return MemcmpReturnType::zero();
    }
  }

  LIBC_INLINE static MemcmpReturnType
  loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
    const AlignHelper<sizeof(T)> helper(p1);
    if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
      if (auto value = block(p1, p2))
        return value;
      adjust(helper.offset, p1, p2, count);
    }
    return loop_and_tail(p1, p2, count);
  }
};
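
// Illustrative only (the real dispatch lives in the memcmp implementation
// files): counts in [16, 32] can be handled with a single head_tail on a
// 16-byte element and larger counts with the loop variant, assuming the
// generic_v128 comparison specializations are available on the target:
//
//   if (count <= 32)
//     return generic::Memcmp<generic_v128>::head_tail(p1, p2, count);
//   return generic::Memcmp<generic_v128>::loop_and_tail(p1, p2, count);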

template <typename T, typename... TS> struct MemcmpSequence {
  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
    // TODO: test suggestion in
    // https://reviews.llvm.org/D148717?id=515724#inline-1446890
    // once we have a proper way to check memory operation latency.
    if constexpr (cmp_is_expensive<T>::value) {
      if (!eq<T>(p1, p2, 0))
        return cmp_neq<T>(p1, p2, 0);
    } else {
      if (auto value = cmp<T>(p1, p2, 0))
        return value;
    }
    if constexpr (sizeof...(TS) > 0)
      return MemcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
    else
      return MemcmpReturnType::zero();
  }
};
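
// Illustrative only: a sequence compares an exact, odd number of bytes by
// chaining decreasing types, assuming the uint16_t comparison specializations
// are available:
//
//   using Seq3 = generic::MemcmpSequence<uint16_t, uint8_t>;
//   static_assert(Seq3::SIZE == 3);
//   return Seq3::block(p1, p2); // zero iff the 3 bytes at p1 and p2 are equal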

///////////////////////////////////////////////////////////////////////////////
// Bcmp
///////////////////////////////////////////////////////////////////////////////
template <typename T> struct Bcmp {
  static_assert(is_element_type_v<T>);
  static constexpr size_t SIZE = sizeof(T);

  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    return neq<T>(p1, p2, 0);
  }

  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    const size_t tail_offset = count - SIZE;
    return neq<T>(p1, p2, tail_offset);
  }

  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if (const auto value = neq<T>(p1, p2, 0))
      return value;
    return tail(p1, p2, count);
  }

  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                  size_t count) {
    return loop_and_tail_offset(p1, p2, count, 0);
  }

  LIBC_INLINE static BcmpReturnType
  loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
    if constexpr (SIZE > 1) {
      const size_t limit = count - SIZE;
      LIBC_LOOP_NOUNROLL
      for (; offset < limit; offset += SIZE)
        if (const auto value = neq<T>(p1, p2, offset))
          return value;
      return tail(p1, p2, count);
    } else {
      // No need for a tail operation when SIZE == 1.
      LIBC_LOOP_NOUNROLL
      for (; offset < count; offset += SIZE)
        if (const auto value = neq<T>(p1, p2, offset))
          return value;
      return BcmpReturnType::zero();
    }
  }

  LIBC_INLINE static BcmpReturnType
  loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
    static_assert(SIZE > 1,
                  "No need to align when processing one byte at a time");
    const AlignHelper<sizeof(T)> helper(p1);
    if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
      if (auto value = block(p1, p2))
        return value;
      adjust(helper.offset, p1, p2, count);
    }
    return loop_and_tail(p1, p2, count);
  }
};

template <typename T, typename... TS> struct BcmpSequence {
  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
    if (auto value = neq<T>(p1, p2, 0))
      return value;
    if constexpr (sizeof...(TS) > 0)
      return BcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
    else
      return BcmpReturnType::zero();
  }
};

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8_t
template <> struct cmp_is_expensive<uint8_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint8_t>(p1, offset) == load<uint8_t>(p2, offset);
}
template <> LIBC_INLINE uint32_t neq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint8_t>(p1, offset) ^ load<uint8_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load<uint8_t>(p1, offset)) -
         static_cast<int32_t>(load<uint8_t>(p2, offset));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint8_t>(CPtr p1, CPtr p2, size_t offset);

} // namespace LIBC_NAMESPACE::generic

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H