//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h"     // LIBC_NAMESPACE_DECL
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif

// Define fake intrinsics to prevent the compiler from failing on undeclared
// functions in case the CPU extension is not present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm512_cmpneq_epi8_mask
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm256_movemask_epi8
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm_movemask_epi8
#define _mm_movemask_epi8(A) 0
#endif

namespace LIBC_NAMESPACE_DECL {
namespace x86 {

// A set of constants to check compile-time features.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
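// These constants allow selecting between the specializations below with
// ordinary constant expressions rather than preprocessor conditionals.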

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
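// 'rep movsb' copies RCX bytes from [RSI] to [RDI]. The "+D", "+S" and "+c"
// constraints below bind 'dst', 'src' and 'count' to those registers and mark
// them read-write, since the instruction advances both pointers and
// decrements the count as it runs.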
struct Memcpy {
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};

} // namespace x86
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

// Not equals: returns non-zero iff values at head or tail differ.
// This function typically loads more data than necessary when the two buffers
// differ.
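// For example (illustrative values): with T = uint32_t and count = 6, the two
// 4-byte loads cover bytes [0, 4) and [2, 6); they overlap, but together they
// inspect every byte of both buffers.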
template <typename T>
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
  static_assert(cpp::is_integral_v<T>);
  return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
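// Both big-endian values fit in 16 bits, so their difference fits in an
// 'int32_t' and a plain subtraction yields a correctly signed memcmp result.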
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
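// A 64-bit difference does not fit in 'MemcmpReturnType', so the big-endian
// loads below are handed to 'cmp_neq_uint64_t', which orders them as unsigned
// values and returns a correctly signed result.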
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}

// SIMD types are defined with attributes. e.g., '__m128i' is defined as
// long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specialization GCC complains:
// "ignoring attributes on template argument '__m128i' [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  return _mm_xor_si128(a, b);
}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
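// '_mm_set_epi8' takes its arguments from the most significant byte down to
// the least significant one, so the index pattern 0, 1, ..., 15 below makes
// '_mm_shuffle_epi8' reverse the byte order of the vector.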
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
LIBC_INLINE bool is_zero(__m128i value) {
  return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m128i head = load_and_xor_m128i(p1, p2, 0);
  const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
  return !is_zero(_mm_or_si128(head, tail));
}
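// Comparison trick used by 'cmp_neq<__m128i>' below (and by the wider
// specializations later in this file): 'bytewise_max(a, b)' equals 'a' at
// every byte where 'a >= b' and equals 'b' where 'b >= a'. The big-endian
// masks 'ge' and 'le' therefore agree on all bytes before the first
// difference, and at the first differing byte exactly one of them has the
// more significant bit set, so comparing the masks as integers yields the
// memcmp ordering.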
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
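// AVX only provides 256-bit logic operations in the floating-point domain
// ('_mm256_xor_ps', '_mm256_or_ps'); the integer forms require AVX2. Casting
// through the float domain keeps these helpers usable in this AVX-only block.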
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  return xor_m256i(a, b);
}
LIBC_INLINE bool is_zero(__m256i value) {
  return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m256i head = load_and_xor_m256i(p1, p2, 0);
  const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
  return !is_zero(or_m256i(head, tail));
}
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into a 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
  // leading to:
  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane, leading to a half-reversed vector
  // mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15,  //
                                               0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into a 'uint32_t'.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper parts. This is optimized into a single `rorx`
  // instruction.
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
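// Unlike the 16-bit case, these 32-bit masks are not subtracted directly: the
// difference may not fit in an 'int32_t'. 'cmp_neq_uint64_t' orders them as
// wider unsigned values instead.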
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
  // The AVX512VBMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable once the minimum supported clang version contains the fix.
#if false && defined(__AVX512VBMI__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with AVX512BW
  // alone. '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
  // So we only reverse groups of 8 bytes; these groups are necessarily within
  // a 16-byte lane.
  // zmm = |   16 bytes    |   16 bytes    |   16 bytes    |   16 bytes    |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits of each
  // byte are already reversed, but the bytes themselves still need to be
  // reversed; this is done with a bswap instruction.
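  // Reversing the bits within each byte of the mask (via the shuffle above)
  // and then reversing the bytes of the mask (via bswap) reverses all 64 bit
  // positions, which is exactly the big-endian order we want.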
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));

#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_xor_epi64(a, b);
}
LIBC_INLINE bool is_zero(__m512i value) {
  return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m512i head = load_and_xor_m512i(p1, p2, 0);
  const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
  return !is_zero(_mm512_or_epi64(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

#pragma GCC diagnostic pop

} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_X86

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H