//===-- Memcpy implementation for x86_64 ------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR
#include "src/__support/macros/config.h" // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t
#include <stdint.h> // SIZE_MAX

#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB

#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE instead.
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE

namespace LIBC_NAMESPACE {

namespace x86 {

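// Cache-line geometry used by the software-prefetching paths below; x86-64
// cache lines are 64 bytes.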
LIBC_INLINE_VAR constexpr size_t K_ONE_CACHELINE = 64;
LIBC_INLINE_VAR constexpr size_t K_TWO_CACHELINES = 2 * K_ONE_CACHELINE;
LIBC_INLINE_VAR constexpr size_t K_THREE_CACHELINES = 3 * K_ONE_CACHELINE;

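// Set at build time by defining LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING;
// when true, the '*_sw_prefetching' variants below are selected.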
LIBC_INLINE_VAR constexpr bool K_USE_SOFTWARE_PREFETCHING =
    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);

// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only
// above a certain threshold. Defaults to "do not use rep;movsb".
#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
#endif
LIBC_INLINE_VAR constexpr size_t K_REP_MOVSB_THRESHOLD =
    LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
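// Example (illustrative threshold, to be tuned per target): building with
//   -DLIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=1024
// routes copies of 1 KiB and larger through 'rep;movsb' while smaller copies
// keep using the vectorized code paths below.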

} // namespace x86

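// Handles copies of at least 64 bytes without AVX: a 64-byte head/tail copy
// for counts up to 128, otherwise a 32-byte block, alignment of 'dst' to a
// 32-byte boundary, and a 32-byte loop with tail.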
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
                            size_t count) {
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
}

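// Handles copies of at least 64 bytes with AVX: 64-byte or 128-byte head/tail
// copies for counts below 256, otherwise a 32-byte block, alignment of 'dst'
// to a 32-byte boundary, and a 64-byte loop with tail.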
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
                           size_t count) {
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  if (count < 256)
    return builtin::Memcpy<128>::head_tail(dst, src, count);
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}

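// Same strategy as inline_memcpy_x86_sse2_ge64 but with explicit software
// prefetches of the next one to three cache lines of 'src', copying two or
// three cache lines per iteration depending on 'count'.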
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
  using namespace LIBC_NAMESPACE::x86;
  prefetch_to_local_cache(src + K_ONE_CACHELINE);
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  prefetch_to_local_cache(src + K_TWO_CACHELINES);
  // Aligning 'dst' on a 32B boundary.
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  builtin::Memcpy<96>::block(dst, src);
  size_t offset = 96;
  // At this point:
  // - we copied between 96B and 128B,
  // - we prefetched cachelines at 'src + 64' and 'src + 128',
  // - 'dst' is 32B aligned,
  // - count >= 128.
  if (count < 352) {
    // Two cache lines at a time.
    while (offset + K_TWO_CACHELINES + 32 <= count) {
      prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
      prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
      builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
      offset += K_TWO_CACHELINES;
    }
  } else {
    // Three cache lines at a time.
    while (offset + K_THREE_CACHELINES + 32 <= count) {
      prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
      prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
      prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
      // It is likely that this copy will be turned into a 'rep;movsb' on
      // non-AVX machines.
      builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
      offset += K_THREE_CACHELINES;
    }
  }
  return builtin::Memcpy<32>::loop_and_tail_offset(dst, src, count, offset);
}

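// Same strategy as inline_memcpy_x86_avx_ge64 but with explicit software
// prefetches of the next cache lines of 'src', copying three cache lines per
// iteration in the main loop.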
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
                                          CPtr __restrict src, size_t count) {
  using namespace LIBC_NAMESPACE::x86;
  prefetch_to_local_cache(src + K_ONE_CACHELINE);
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  prefetch_to_local_cache(src + K_TWO_CACHELINES);
  prefetch_to_local_cache(src + K_THREE_CACHELINES);
  if (count < 256)
    return builtin::Memcpy<128>::head_tail(dst, src, count);
  // Aligning 'dst' on a 32B boundary.
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
  builtin::Memcpy<224>::block(dst, src);
  size_t offset = 224;
  // At this point:
  // - we copied between 224B and 256B,
  // - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 192',
  // - 'dst' is 32B aligned,
  // - count >= 128.
  while (offset + K_THREE_CACHELINES + 64 <= count) {
    // Three cache lines at a time.
    prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
    prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
    prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
    builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
    offset += K_THREE_CACHELINES;
  }
  return builtin::Memcpy<64>::loop_and_tail_offset(dst, src, count, offset);
}

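// Top-level size dispatch: small counts are served by fixed-size blocks and
// head/tail copies sized to the widest natively supported vector; larger
// counts are delegated to the >= 64-byte helpers above, selected at compile
// time based on AVX availability and the software-prefetching option.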
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
#if defined(__AVX512F__)
  constexpr size_t VECTOR_SIZE = 64;
#elif defined(__AVX__)
  constexpr size_t VECTOR_SIZE = 32;
#elif defined(__SSE2__)
  constexpr size_t VECTOR_SIZE = 16;
#else
  constexpr size_t VECTOR_SIZE = 8;
#endif
  if (count == 0)
    return;
  if (count == 1)
    return builtin::Memcpy<1>::block(dst, src);
  if (count == 2)
    return builtin::Memcpy<2>::block(dst, src);
  if (count == 3)
    return builtin::Memcpy<3>::block(dst, src);
  if (count == 4)
    return builtin::Memcpy<4>::block(dst, src);
  if (count < 8)
    return builtin::Memcpy<4>::head_tail(dst, src, count);
  // If count is equal to a power of 2, we can handle it as head-tail
  // of both smaller size and larger size (head-tail are either
  // non-overlapping for smaller size, or completely collapsed
  // for larger size). It seems to be more profitable to do the copy
  // with the larger size, if it's natively supported (e.g. doing
  // 2 collapsed 32-byte moves for count=32 if AVX2 is supported).
  // But it's not profitable to use larger size if it's not natively
  // supported: we will both use more instructions and handle fewer
  // sizes in earlier branches.
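  // Illustration (assuming AVX2, i.e. VECTOR_SIZE == 32): count == 32 is not
  // caught by the 16-byte branch below ('count < 32' is false) and falls
  // through to Memcpy<32>::head_tail, i.e. two fully collapsed 32-byte
  // accesses; with only SSE2 the check is 'count <= 32', so count == 32 stays
  // on the cheaper 16-byte branch.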
  if (VECTOR_SIZE >= 16 ? count < 16 : count <= 16)
    return builtin::Memcpy<8>::head_tail(dst, src, count);
  if (VECTOR_SIZE >= 32 ? count < 32 : count <= 32)
    return builtin::Memcpy<16>::head_tail(dst, src, count);
  if (VECTOR_SIZE >= 64 ? count < 64 : count <= 64)
    return builtin::Memcpy<32>::head_tail(dst, src, count);
  if constexpr (x86::K_AVX) {
    if constexpr (x86::K_USE_SOFTWARE_PREFETCHING) {
      return inline_memcpy_x86_avx_ge64_sw_prefetching(dst, src, count);
    } else {
      return inline_memcpy_x86_avx_ge64(dst, src, count);
    }
  } else {
    if constexpr (x86::K_USE_SOFTWARE_PREFETCHING) {
      return inline_memcpy_x86_sse2_ge64_sw_prefetching(dst, src, count);
    } else {
      return inline_memcpy_x86_sse2_ge64(dst, src, count);
    }
  }
}

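// Dispatches between 'rep;movsb' and inline_memcpy_x86 according to the
// compile-time K_REP_MOVSB_THRESHOLD: 0 means always 'rep;movsb', SIZE_MAX
// means never, and any other value is the minimum size handled by
// 'rep;movsb'.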
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
  if constexpr (x86::K_REP_MOVSB_THRESHOLD == 0) {
    return x86::Memcpy::repmovsb(dst, src, count);
  } else if constexpr (x86::K_REP_MOVSB_THRESHOLD == SIZE_MAX) {
    return inline_memcpy_x86(dst, src, count);
  } else {
    if (LIBC_UNLIKELY(count >= x86::K_REP_MOVSB_THRESHOLD))
      return x86::Memcpy::repmovsb(dst, src, count);
    else
      return inline_memcpy_x86(dst, src, count);
  }
}

} // namespace LIBC_NAMESPACE

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H