//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/macros/properties/architectures.h"

#if defined(LIBC_TARGET_ARCH_IS_X86_64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) ||      \
    defined(__SSE2__)
#include <immintrin.h>
#endif
// Define fake intrinsics as macros so the compiler does not fail on calls to
// intrinsics that are unavailable when the corresponding CPU extension is not
// enabled.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#define _mm_movemask_epi8(A) 0
#endif

namespace LIBC_NAMESPACE::x86 {

// A set of constants to check compile-time features.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
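// These constants allow selecting an implementation at compile time, e.g.
// (illustrative sketch, not code from this file):
//   if constexpr (x86::K_AVX2) {
//     // use the __m256i specializations below
//   }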

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
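// 'rep movsb' copies RCX bytes from [RSI] to [RDI], advancing both pointers.
// The "+D", "+S" and "+c" constraints pin dst, src and count to RDI, RSI and
// RCX respectively, and the "memory" clobber tells the compiler that memory
// is both read and written. On CPUs with ERMSB (Enhanced REP MOVSB), this
// single instruction is competitive with vectorized copy loops at large sizes.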
struct Memcpy {
  LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};

} // namespace LIBC_NAMESPACE::x86

namespace LIBC_NAMESPACE::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
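// Loading both values as big-endian puts the byte at the lowest address in
// the most significant position, so integer subtraction orders the same way
// memcmp does. Both operands fit in int32_t, so the subtraction cannot
// overflow.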
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
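// Declared but intentionally not defined: since cmp_is_expensive<uint16_t> is
// false_type, the generic algorithms are expected to call cmp directly and
// never instantiate cmp_neq for this type.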
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
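// Unlike the uint16_t case, the difference of two 32-bit values does not fit
// in an int32_t, so the helper cmp_uint32_t is used to produce the ordering
// without overflow.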
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load_be<uint32_t>(p1, offset);
  const auto b = load_be<uint32_t>(p2, offset);
  return cmp_uint32_t(a, b);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
  return !eq<uint64_t>(p1, p2, offset);
}
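// cmp is declared but not defined: cmp_is_expensive<uint64_t> is true_type,
// so the generic algorithms first test with eq/neq and only call cmp_neq once
// a difference is known to exist.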
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
                                               size_t offset) {
  const auto a = load_be<uint64_t>(p1, offset);
  const auto b = load_be<uint64_t>(p2, offset);
  return cmp_neq_uint64_t(a, b);
}

// SIMD types are defined with attributes. e.g., '__m128i' is defined as
// long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specialization GCC complains:
// "ignoring attributes on template argument '__m128i' [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
  return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
  return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
                                              8, 9, 10, 11, 12, 13, 14, 15));
}
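// Returns a 16-bit mask with one bit per byte, set when the corresponding
// byte of 'value' equals the byte of 'max'. Because the vector is
// byte-reversed before movemask, bit 15 corresponds to the byte at the lowest
// address, so comparing two such masks as integers matches the lexicographic
// order of the underlying bytes.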
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
  return static_cast<uint16_t>(
      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto xored = _mm_xor_si128(a, b);
  return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m128i>(p1, offset);
  const auto b = load<__m128i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
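  // 'ge' and 'le' agree on every byte up to the first difference, where only
  // the greater operand equals 'vmax'. Since higher mask bits correspond to
  // lower addresses, the sign of 'ge - le' matches the sign memcmp requires.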
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
  return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
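// AVX (without AVX2) offers no 256-bit integer xor, so the operands are cast
// to the floating-point domain, xored there, and tested for zero.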
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto xored = _mm256_castps_si256(
      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
  return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
}
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
  return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Bytewise comparison of 'max' and 'value'.
  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
  // Because x86 is little endian, bytes in the vector must be reversed before
  // using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
  const __m256i big_endian_byte_mask =
      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                              8, 9, 10, 11, 12, 13, 14, 15,   //
                                              16, 17, 18, 19, 20, 21, 22, 23, //
                                              24, 25, 26, 27, 28, 29, 30, 31),
                              little_endian_byte_mask);
  // And turn the byte vector mask into a 'uint32_t' for direct scalar
  // comparison.
  return _mm256_movemask_epi8(big_endian_byte_mask);
#else
  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
  // leading to:
  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
  // So we first shuffle each 16-byte lane, yielding a half-reversed vector
  // mask.
  const __m256i half_reversed = _mm256_shuffle_epi8(
      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15, //
                                               0, 1, 2, 3, 4, 5, 6, 7,       //
                                               8, 9, 10, 11, 12, 13, 14, 15));
  // Then we turn the vector into a 'uint32_t'.
  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
  // And swap the lower and upper parts. This is optimized into a single `rorx`
  // instruction.
  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m256i>(p1, offset);
  const auto b = load<__m256i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
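  // The masks are 32 bits wide, so their difference no longer fits in the
  // int32_t return value; cmp_neq_uint64_t compares them as zero-extended
  // 64-bit integers instead.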
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
  return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
  // The AVX512VBMI version is disabled due to bad codegen.
  // https://github.com/llvm/llvm-project/issues/77459
  // https://github.com/llvm/llvm-project/pull/77081
  // TODO: Re-enable it once the minimum supported compiler version includes
  // the fix.
#if false && defined(__AVX512VBMI__)
  // When AVX512VBMI is available we can completely reverse the vector through
  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                       8, 9, 10, 11, 12, 13, 14, 15,   //
                                       16, 17, 18, 19, 20, 21, 22, 23, //
                                       24, 25, 26, 27, 28, 29, 30, 31, //
                                       32, 33, 34, 35, 36, 37, 38, 39, //
                                       40, 41, 42, 43, 44, 45, 46, 47, //
                                       48, 49, 50, 51, 52, 53, 54, 55, //
                                       56, 57, 58, 59, 60, 61, 62, 63);
  // Then we compute the mask for equal bytes.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
                                _mm512_permutexvar_epi8(indices, value));
#else
  // We can't byte-reverse '__m512i' in a single instruction with AVX512BW.
  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane, so we
  // only reverse groups of 8 bytes; these groups necessarily lie within a
  // 16-byte lane.
  // zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
  const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,      //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,      //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,      //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7);
  // Then we compute the mask for equal bytes. In this mask the bits within
  // each byte are already reversed, but the bytes of the mask themselves
  // still need to be reversed; this is done with a bswap instruction.
  return __builtin_bswap64(
      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
                             _mm512_shuffle_epi8(value, indices)));

#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
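  // Fold the 64-bit mismatch mask into 32 bits; the result is nonzero exactly
  // when at least one byte differs, which is all 'neq' has to guarantee.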
  return static_cast<uint32_t>(xored >> 32) |
         static_cast<uint32_t>(xored & 0xFFFFFFFF);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
  const auto a = load<__m512i>(p1, offset);
  const auto b = load<__m512i>(p2, offset);
  const auto vmax = bytewise_max(a, b);
  const auto le = big_endian_cmp_mask(vmax, b);
  const auto ge = big_endian_cmp_mask(vmax, a);
  static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
  return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

#pragma GCC diagnostic pop

} // namespace LIBC_NAMESPACE::generic

#endif // LIBC_TARGET_ARCH_IS_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

