1 | //===-- x86 implementation of memory function building blocks -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file provides x86 specific building blocks to compose memory functions. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H |
13 | #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H |
14 | |
15 | #include "src/__support/macros/properties/architectures.h" |
16 | |
17 | #if defined(LIBC_TARGET_ARCH_IS_X86_64) |
18 | |
19 | #include "src/__support/common.h" |
20 | #include "src/string/memory_utils/op_builtin.h" |
21 | #include "src/string/memory_utils/op_generic.h" |
22 | |
23 | #if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) || \ |
24 | defined(__SSE2__) |
25 | #include <immintrin.h> |
26 | #endif |
27 | |
// Define fake intrinsics (as macros) to prevent the compiler from failing on
// undefined functions in case the CPU extension is not present.
30 | #if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__)) |
31 | #define _mm512_cmpneq_epi8_mask(A, B) 0 |
32 | #endif |
33 | #if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__)) |
34 | #define _mm256_movemask_epi8(A) 0 |
35 | #endif |
36 | #if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__)) |
37 | #define _mm_movemask_epi8(A) 0 |
38 | #endif |
39 | |
40 | namespace LIBC_NAMESPACE::x86 { |
41 | |
// A set of constants to check compile-time features.
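// They convert the preprocessor feature macros into constexpr booleans so that
// higher level code can select implementations with regular C++ expressions
// instead of preprocessor conditionals.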
43 | LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__); |
44 | LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__); |
45 | LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__); |
46 | LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__); |
47 | LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__); |
48 | LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); |
49 | |
50 | /////////////////////////////////////////////////////////////////////////////// |
51 | // Memcpy repmovsb implementation |
52 | struct Memcpy { |
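  // 'rep movsb' copies 'count' bytes from '[rsi]' to '[rdi]'. The "+D", "+S"
  // and "+c" constraints pin 'dst', 'src' and 'count' to the rdi, rsi and rcx
  // registers and mark them read-write; the "memory" clobber tells the
  // compiler that the asm reads and writes memory.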
53 | LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) { |
54 | asm volatile("rep movsb" : "+D" (dst), "+S" (src), "+c" (count) : : "memory" ); |
55 | } |
56 | }; |
57 | |
58 | } // namespace LIBC_NAMESPACE::x86 |
59 | |
60 | namespace LIBC_NAMESPACE::generic { |
61 | |
62 | /////////////////////////////////////////////////////////////////////////////// |
63 | // Specializations for uint16_t |
64 | template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {}; |
65 | template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) { |
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
67 | } |
68 | template <> |
69 | LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) { |
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
71 | } |
72 | template <> |
73 | LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) { |
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
76 | } |
77 | template <> |
78 | LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset); |
79 | |
80 | /////////////////////////////////////////////////////////////////////////////// |
81 | // Specializations for uint32_t |
82 | template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {}; |
83 | template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) { |
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
85 | } |
86 | template <> |
87 | LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) { |
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
89 | } |
90 | template <> |
91 | LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) { |
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
94 | return cmp_uint32_t(a, b); |
95 | } |
96 | template <> |
97 | LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset); |
98 | |
99 | /////////////////////////////////////////////////////////////////////////////// |
100 | // Specializations for uint64_t |
101 | template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {}; |
102 | template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) { |
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
104 | } |
105 | template <> |
106 | LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) { |
107 | return !eq<uint64_t>(p1, p2, offset); |
108 | } |
109 | template <> |
110 | LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset); |
111 | template <> |
112 | LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2, |
113 | size_t offset) { |
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
116 | return cmp_neq_uint64_t(a, b); |
117 | } |
118 | |
// SIMD types are defined with attributes, e.g., '__m128i' is defined as
//   long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specializations, GCC complains:
// "ignoring attributes on template argument '__m128i' [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
124 | #pragma GCC diagnostic push |
125 | #pragma GCC diagnostic ignored "-Wignored-attributes" |
126 | |
127 | /////////////////////////////////////////////////////////////////////////////// |
128 | // Specializations for __m128i |
129 | #if defined(__SSE4_1__) |
130 | template <> struct is_vector<__m128i> : cpp::true_type {}; |
131 | template <> struct cmp_is_expensive<__m128i> : cpp::true_type {}; |
132 | LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) { |
  return _mm_max_epu8(a, b);
134 | } |
135 | LIBC_INLINE __m128i bytewise_reverse(__m128i value) { |
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
138 | } |
139 | LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) { |
140 | return static_cast<uint16_t>( |
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
142 | } |
143 | template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) { |
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
148 | } |
149 | template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) { |
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
154 | } |
155 | template <> |
156 | LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) { |
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
162 | static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>); |
163 | return static_cast<int32_t>(ge) - static_cast<int32_t>(le); |
164 | } |
165 | #endif // __SSE4_1__ |
166 | |
167 | /////////////////////////////////////////////////////////////////////////////// |
168 | // Specializations for __m256i |
169 | #if defined(__AVX__) |
170 | template <> struct is_vector<__m256i> : cpp::true_type {}; |
171 | template <> struct cmp_is_expensive<__m256i> : cpp::true_type {}; |
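// With AVX (but without AVX2) there is no 256-bit integer xor, so the values
// are xored through the floating-point domain; vptest ('_mm256_testz_si256')
// is available with AVX and checks the result for all zeros.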
172 | template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) { |
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
178 | } |
179 | template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) { |
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
185 | } |
186 | #endif // __AVX__ |
187 | |
188 | #if defined(__AVX2__) |
189 | LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) { |
  return _mm256_max_epu8(a, b);
191 | } |
192 | LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) { |
193 | // Bytewise comparison of 'max' and 'value'. |
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
195 | // Because x86 is little endian, bytes in the vector must be reversed before |
196 | // using movemask. |
197 | #if defined(__AVX512VBMI__) && defined(__AVX512VL__) |
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB: __m256i _mm256_permutexvar_epi8(__m256i idx, __m256i a);
200 | const __m256i big_endian_byte_mask = |
201 | _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, // |
202 | 8, 9, 10, 11, 12, 13, 14, 15, // |
203 | 16, 17, 18, 19, 20, 21, 22, 23, // |
204 | 24, 25, 26, 27, 28, 29, 30, 31), |
205 | little_endian_byte_mask); |
206 | // And turn the byte vector mask into an 'uint32_t' for direct scalar |
207 | // comparison. |
208 | return _mm256_movemask_epi8(big_endian_byte_mask); |
209 | #else |
210 | // We can't byte-reverse '__m256i' in a single instruction with AVX2. |
211 | // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane |
212 | // leading to: |
213 | // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, |
214 | // 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16] |
  // So we first shuffle each 16-byte lane, yielding a half-reversed mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15,  //
                                               0, 1, 2, 3, 4, 5, 6, 7,        //
                                               8, 9, 10, 11, 12, 13, 14, 15));
221 | // Then we turn the vector into an uint32_t. |
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
223 | // And swap the lower and upper parts. This is optimized into a single `rorx` |
224 | // instruction. |
225 | return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16); |
226 | #endif |
227 | } |
228 | template <> |
229 | LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) { |
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
  return cmp_neq_uint64_t(ge, le);
237 | } |
238 | #endif // __AVX2__ |
239 | |
240 | /////////////////////////////////////////////////////////////////////////////// |
241 | // Specializations for __m512i |
242 | #if defined(__AVX512BW__) |
243 | template <> struct is_vector<__m512i> : cpp::true_type {}; |
244 | template <> struct cmp_is_expensive<__m512i> : cpp::true_type {}; |
245 | LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) { |
246 | return _mm512_max_epu8(a, b); |
247 | } |
248 | LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) { |
  // The AVX512VBMI version is disabled due to bad codegen.
250 | // https://github.com/llvm/llvm-project/issues/77459 |
251 | // https://github.com/llvm/llvm-project/pull/77081 |
252 | // TODO: Re-enable when clang version meets the fixed version. |
253 | #if false && defined(__AVX512VBMI__) |
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB: __m512i _mm512_permutexvar_epi8(__m512i idx, __m512i a);
256 | const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, // |
257 | 8, 9, 10, 11, 12, 13, 14, 15, // |
258 | 16, 17, 18, 19, 20, 21, 22, 23, // |
259 | 24, 25, 26, 27, 28, 29, 30, 31, // |
260 | 32, 33, 34, 35, 36, 37, 38, 39, // |
261 | 40, 41, 42, 43, 44, 45, 46, 47, // |
262 | 48, 49, 50, 51, 52, 53, 54, 55, // |
263 | 56, 57, 58, 59, 60, 61, 62, 63); |
264 | // Then we compute the mask for equal bytes. |
265 | return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), // |
266 | _mm512_permutexvar_epi8(indices, value)); |
267 | #else |
  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane, so we only
  // reverse groups of 8 bytes; these groups necessarily lie within a single
  // 16-byte lane.
272 | // zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes | |
273 | // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> | |
274 | const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, // |
275 | 0, 1, 2, 3, 4, 5, 6, 7, // |
276 | 8, 9, 10, 11, 12, 13, 14, 15, // |
277 | 0, 1, 2, 3, 4, 5, 6, 7, // |
278 | 8, 9, 10, 11, 12, 13, 14, 15, // |
279 | 0, 1, 2, 3, 4, 5, 6, 7, // |
280 | 8, 9, 10, 11, 12, 13, 14, 15, // |
281 | 0, 1, 2, 3, 4, 5, 6, 7); |
  // Then we compute the mask for equal bytes. In this mask the bits within
  // each byte are already in reversed order, but the bytes of the mask
  // themselves still need to be reversed; this is done with a bswap
  // instruction.
285 | return __builtin_bswap64( |
286 | _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), // |
287 | _mm512_shuffle_epi8(value, indices))); |
288 | |
289 | #endif |
290 | } |
291 | template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) { |
292 | const auto a = load<__m512i>(p1, offset); |
293 | const auto b = load<__m512i>(p2, offset); |
294 | return _mm512_cmpneq_epi8_mask(a, b) == 0; |
295 | } |
296 | template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) { |
297 | const auto a = load<__m512i>(p1, offset); |
298 | const auto b = load<__m512i>(p2, offset); |
299 | const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b); |
300 | return static_cast<uint32_t>(xored >> 32) | |
301 | static_cast<uint32_t>(xored & 0xFFFFFFFF); |
302 | } |
303 | template <> |
304 | LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) { |
305 | const auto a = load<__m512i>(p1, offset); |
306 | const auto b = load<__m512i>(p2, offset); |
307 | const auto vmax = bytewise_max(a, b); |
308 | const auto le = big_endian_cmp_mask(vmax, b); |
309 | const auto ge = big_endian_cmp_mask(vmax, a); |
310 | static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>); |
311 | return cmp_neq_uint64_t(ge, le); |
312 | } |
313 | #endif // __AVX512BW__ |
314 | |
315 | #pragma GCC diagnostic pop |
316 | |
317 | } // namespace LIBC_NAMESPACE::generic |
318 | |
319 | #endif // LIBC_TARGET_ARCH_IS_X86_64 |
320 | |
321 | #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H |
322 | |