1 | //===-- Memset implementation for x86_64 ------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H |
9 | #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H |
10 | |
11 | #include "src/__support/macros/attributes.h" // LIBC_INLINE |
12 | #include "src/string/memory_utils/op_generic.h" |
13 | #include "src/string/memory_utils/op_x86.h" |
14 | #include "src/string/memory_utils/utils.h" // Ptr, CPtr |
15 | |
16 | #include <stddef.h> // size_t |
17 | |
18 | namespace LIBC_NAMESPACE { |
19 | namespace x86 { |
20 | // Size of one cache line for software prefetching |
21 | LIBC_INLINE_VAR constexpr size_t K_ONE_CACHELINE_SIZE = 64; |
22 | LIBC_INLINE_VAR constexpr size_t K_TWO_CACHELINES_SIZE = |
23 | K_ONE_CACHELINE_SIZE * 2; |
24 | LIBC_INLINE_VAR constexpr size_t K_FIVE_CACHELINES_SIZE = |
25 | K_ONE_CACHELINE_SIZE * 5; |
26 | |
27 | LIBC_INLINE_VAR constexpr bool K_USE_SOFTWARE_PREFETCHING_MEMSET = |
28 | LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING); |
29 | |
30 | } // namespace x86 |
31 | |
32 | #if defined(__AVX512F__) |
33 | using uint128_t = generic_v128; |
34 | using uint256_t = generic_v256; |
35 | using uint512_t = generic_v512; |
36 | #elif defined(__AVX__) |
37 | using uint128_t = generic_v128; |
38 | using uint256_t = generic_v256; |
39 | using uint512_t = cpp::array<generic_v256, 2>; |
40 | #elif defined(__SSE2__) |
41 | using uint128_t = generic_v128; |
42 | using uint256_t = cpp::array<generic_v128, 2>; |
43 | using uint512_t = cpp::array<generic_v128, 4>; |
44 | #else |
45 | using uint128_t = cpp::array<uint64_t, 2>; |
46 | using uint256_t = cpp::array<uint64_t, 4>; |
47 | using uint512_t = cpp::array<uint64_t, 8>; |
48 | #endif |
49 | |
50 | [[maybe_unused]] LIBC_INLINE static void |
51 | inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) { |
52 | constexpr size_t PREFETCH_DISTANCE = x86::K_FIVE_CACHELINES_SIZE; |
53 | constexpr size_t PREFETCH_DEGREE = x86::K_TWO_CACHELINES_SIZE; |
54 | constexpr size_t SIZE = sizeof(uint256_t); |
55 | // Prefetch one cache line |
56 | prefetch_for_write(dst: dst + x86::K_ONE_CACHELINE_SIZE); |
57 | if (count <= 128) |
58 | return generic::Memset<uint512_t>::head_tail(dst, value, count); |
59 | // Prefetch the second cache line |
60 | prefetch_for_write(dst: dst + x86::K_TWO_CACHELINES_SIZE); |
61 | // Aligned loop |
62 | generic::Memset<uint256_t>::block(dst, value); |
63 | align_to_next_boundary<32>(p1&: dst, count); |
64 | if (count <= 192) { |
65 | return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); |
66 | } else { |
67 | generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value); |
68 | size_t offset = 96; |
69 | while (offset + PREFETCH_DEGREE + SIZE <= count) { |
70 | prefetch_for_write(dst: dst + offset + PREFETCH_DISTANCE); |
71 | prefetch_for_write(dst: dst + offset + PREFETCH_DISTANCE + |
72 | x86::K_ONE_CACHELINE_SIZE); |
73 | for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE) |
74 | generic::Memset<uint256_t>::block(dst: dst + offset, value); |
75 | } |
76 | generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset); |
77 | } |
78 | } |
79 | |
80 | [[maybe_unused]] LIBC_INLINE static void |
81 | inline_memset_x86(Ptr dst, uint8_t value, size_t count) { |
82 | if (count == 0) |
83 | return; |
84 | if (count == 1) |
85 | return generic::Memset<uint8_t>::block(dst, value); |
86 | if (count == 2) |
87 | return generic::Memset<uint16_t>::block(dst, value); |
88 | if (count == 3) |
89 | return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value); |
90 | if (count <= 8) |
91 | return generic::Memset<uint32_t>::head_tail(dst, value, count); |
92 | if (count <= 16) |
93 | return generic::Memset<uint64_t>::head_tail(dst, value, count); |
94 | if (count <= 32) |
95 | return generic::Memset<uint128_t>::head_tail(dst, value, count); |
96 | if (count <= 64) |
97 | return generic::Memset<uint256_t>::head_tail(dst, value, count); |
98 | if constexpr (x86::K_USE_SOFTWARE_PREFETCHING_MEMSET) |
99 | return inline_memset_x86_gt64_sw_prefetching(dst, value, count); |
100 | if (count <= 128) |
101 | return generic::Memset<uint512_t>::head_tail(dst, value, count); |
102 | // Aligned loop |
103 | generic::Memset<uint256_t>::block(dst, value); |
104 | align_to_next_boundary<32>(p1&: dst, count); |
105 | return generic::Memset<uint256_t>::loop_and_tail(dst, value, count); |
106 | } |
107 | } // namespace LIBC_NAMESPACE |
108 | |
109 | #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H |
110 | |