//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h"     // LIBC_NAMESPACE_DECL
#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif

// Define fake intrinsics to prevent the compiler from failing on undeclared
// functions in case the CPU extension is not present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm512_cmpneq_epi8_mask
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm256_movemask_epi8
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm_movemask_epi8
#define _mm_movemask_epi8(A) 0
#endif

namespace LIBC_NAMESPACE_DECL {
namespace x86 {

// A set of constants to check compile-time features.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
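// These constants allow selecting between the specializations below with
// ordinary constant expressions rather than preprocessor conditionals.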

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
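// 'rep movsb' copies RCX bytes from [RSI] to [RDI]. The "+D", "+S" and "+c"
// constraints below bind 'dst', 'src' and 'count' to those registers and mark
// them read-write, since the instruction advances both pointers and
// decrements the count as it runs.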
struct Memcpy {
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};

} // namespace x86
} // namespace LIBC_NAMESPACE_DECL

namespace LIBC_NAMESPACE_DECL {
namespace generic {

// Not equals: returns non-zero iff values at head or tail differ.
// This function typically loads more data than necessary when the two buffers
// differ.
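// For example (illustrative values): with T = uint32_t and count = 6, the two
// 4-byte loads cover bytes [0, 4) and [2, 6); they overlap, but together they
// inspect every byte of both buffers.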
template <typename T>
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
  static_assert(cpp::is_integral_v<T>);
  return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
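// Both big-endian values fit in 16 bits, so their difference fits in an
// 'int32_t' and a plain subtraction yields a correctly signed memcmp result.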
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
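// A 64-bit difference does not fit in 'MemcmpReturnType', so the big-endian
// loads below are handed to 'cmp_neq_uint64_t', which orders them as unsigned
// values and returns a correctly signed result.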
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}

// SIMD types are defined with attributes. e.g., '__m128i' is defined as
// long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specialization GCC complains:
// "ignoring attributes on template argument '__m128i' [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  return _mm_xor_si128(a, b);
}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
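// '_mm_set_epi8' takes its arguments from the most significant byte down to
// the least significant one, so the index pattern 0, 1, ..., 15 below makes
// '_mm_shuffle_epi8' reverse the byte order of the vector.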
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
LIBC_INLINE bool is_zero(__m128i value) {
  return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m128i head = load_and_xor_m128i(p1, p2, 0);
  const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
  return !is_zero(_mm_or_si128(head, tail));
}
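// Comparison trick used by 'cmp_neq<__m128i>' below (and by the wider
// specializations later in this file): 'bytewise_max(a, b)' equals 'a' at
// every byte where 'a >= b' and equals 'b' where 'b >= a'. The big-endian
// masks 'ge' and 'le' therefore agree on all bytes before the first
// difference, and at the first differing byte exactly one of them has the
// more significant bit set, so comparing the masks as integers yields the
// memcmp ordering.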
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
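// AVX only provides 256-bit logic operations in the floating-point domain
// ('_mm256_xor_ps', '_mm256_or_ps'); the integer forms require AVX2. Casting
// through the float domain keeps these helpers usable in this AVX-only block.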
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
  return _mm256_castps_si256(
      _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  return xor_m256i(a, b);
}
LIBC_INLINE bool is_zero(__m256i value) {
  return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m256i head = load_and_xor_m256i(p1, p2, 0);
  const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
  return !is_zero(or_m256i(head, tail));
}
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into a 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
  // leading to:
  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane, leading to a half-reversed vector
  // mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15,  //
                                               0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into a 'uint32_t'.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper parts. This is optimized into a single `rorx`
  // instruction.
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
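// Unlike the 16-bit case, these 32-bit masks are not subtracted directly: the
// difference may not fit in an 'int32_t'. 'cmp_neq_uint64_t' orders them as
// wider unsigned values instead.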
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
  // The AVX512VBMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable once the minimum supported clang version contains the fix.
#if false && defined(__AVX512VBMI__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with AVX512BW
  // alone. '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
  // So we only reverse groups of 8 bytes; these groups are necessarily within
  // a 16-byte lane.
  // zmm = |   16 bytes    |   16 bytes    |   16 bytes    |   16 bytes    |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits of each
  // byte are already reversed, but the bytes themselves still need to be
  // reversed; this is done with a bswap instruction.
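  // Reversing the bits within each byte of the mask (via the shuffle above)
  // and then reversing the bytes of the mask (via bswap) reverses all 64 bit
  // positions, which is exactly the big-endian order we want.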
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));

#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_xor_epi64(a, b);
}
LIBC_INLINE bool is_zero(__m512i value) {
  return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
                                                       size_t count) {
  const __m512i head = load_and_xor_m512i(p1, p2, 0);
  const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
  return !is_zero(_mm512_or_epi64(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

#pragma GCC diagnostic pop

} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

#endif // LIBC_TARGET_ARCH_IS_X86

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H