/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include "asm-syntax.h"

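/* Copy strategy used below:
   %rcx and %r9 are kept as one-past-the-end pointers of source and
   destination, so every size class can copy a head block from the
   start and a tail block from the end.
   - At most 512 bytes: the L(check) ladder, which issues all loads
     before any store and therefore handles overlapping buffers.
   - More than 512 bytes but less than half of the shared cache:
     512-byte ZMM blocks, copied forward or backward depending on
     whether the destination lies below or above the source.
   - At least half of the shared cache: non-temporal (streaming)
     stores to avoid displacing the cache.  */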
	.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp %RDX_LP, %RCX_LP
	jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov %RDI_LP, %RAX_LP
	add %RDX_LP, %RAX_LP
	jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp %RDX_LP, %RCX_LP
	jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov %RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add %RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov %edx, %edx
# endif
	lea (%rsi, %rdx), %rcx
	lea (%rdi, %rdx), %r9
	cmp $512, %rdx
	ja L(512bytesormore)

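/* At most 512 bytes (also used to finish the tail of the
   non-temporal paths): load a block from the start of the source
   and a possibly overlapping block from its end, then store both.
   Performing every load before any store keeps overlapping copies
   correct without a separate backward version.  */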
L(check):
	cmp $16, %rdx
	jbe L(less_16bytes)
	cmp $256, %rdx
	jb L(less_256bytes)
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups -0x100(%rcx), %zmm4
	vmovups -0xC0(%rcx), %zmm5
	vmovups -0x80(%rcx), %zmm6
	vmovups -0x40(%rcx), %zmm7
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, -0x100(%r9)
	vmovups %zmm5, -0xC0(%r9)
	vmovups %zmm6, -0x80(%r9)
	vmovups %zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp $128, %dl
	jb L(less_128bytes)
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups -0x80(%rcx), %zmm2
	vmovups -0x40(%rcx), %zmm3
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, -0x80(%r9)
	vmovups %zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp $64, %dl
	jb L(less_64bytes)
	vmovdqu (%rsi), %ymm0
	vmovdqu 0x20(%rsi), %ymm1
	vmovdqu -0x40(%rcx), %ymm2
	vmovdqu -0x20(%rcx), %ymm3
	vmovdqu %ymm0, (%rdi)
	vmovdqu %ymm1, 0x20(%rdi)
	vmovdqu %ymm2, -0x40(%r9)
	vmovdqu %ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp $32, %dl
	jb L(less_32bytes)
	vmovdqu (%rsi), %ymm0
	vmovdqu -0x20(%rcx), %ymm1
	vmovdqu %ymm0, (%rdi)
	vmovdqu %ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu (%rsi), %xmm0
	vmovdqu -0x10(%rcx), %xmm1
	vmovdqu %xmm0, (%rdi)
	vmovdqu %xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp $8, %dl
	jb L(less_8bytes)
	movq (%rsi), %rsi
	movq -0x8(%rcx), %rcx
	movq %rsi, (%rdi)
	movq %rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp $4, %dl
	jb L(less_4bytes)
	mov (%rsi), %esi
	mov -0x4(%rcx), %ecx
	mov %esi, (%rdi)
	mov %ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp $2, %dl
	jb L(less_2bytes)
	mov (%rsi), %si
	mov -0x2(%rcx), %cx
	mov %si, (%rdi)
	mov %cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp $1, %dl
	jb L(less_1bytes)
	mov (%rsi), %cl
	mov %cl, (%rdi)
L(less_1bytes):
	ret

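/* More than 512 bytes: copies of at least half the shared cache
   size take the non-temporal path; up to 1024 bytes are handled
   inline as one 512-byte block from each end; everything in
   between falls through to the 512-byte loops below.  */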
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov $SHARED_CACHE_SIZE_HALF, %r8
# else
	mov __x86_shared_cache_size_half(%rip), %r8
# endif
	cmp %r8, %rdx
	jae L(preloop_large)
	cmp $1024, %rdx
	ja L(1024bytesormore)
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups 0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	vmovups -0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups -0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	vmovups %zmm8, -0x200(%r9)
	vmovups %zmm9, -0x1C0(%r9)
	vmovups %zmm10, -0x180(%r9)
	vmovups %zmm11, -0x140(%r9)
	vmovups %zmm12, -0x100(%r9)
	vmovups %zmm13, -0xC0(%r9)
	vmovups %zmm14, -0x80(%r9)
	vmovups %zmm15, -0x40(%r9)
	ret

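/* Forward copy, destination at or below the source.  The last 512
   bytes of the source are loaded into %zmm8-%zmm15 before the loop
   so they cannot be clobbered by overlapping stores; they are
   written as the final block once the loop is done.  */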
L(1024bytesormore):
	cmp %rsi, %rdi
	ja L(1024bytesormore_bkw)
	sub $512, %r9
	vmovups -0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups -0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access. */
L(gobble_512bytes_loop):
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups 0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	add $512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	add $512, %rdi
	cmp %r9, %rdi
	jb L(gobble_512bytes_loop)
	vmovups %zmm8, (%r9)
	vmovups %zmm9, 0x40(%r9)
	vmovups %zmm10, 0x80(%r9)
	vmovups %zmm11, 0xC0(%r9)
	vmovups %zmm12, 0x100(%r9)
	vmovups %zmm13, 0x140(%r9)
	vmovups %zmm14, 0x180(%r9)
	vmovups %zmm15, 0x1C0(%r9)
	ret

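/* Backward copy, destination above the source.  The first 512
   bytes of the source are preloaded into %zmm8-%zmm15, the copy
   runs from the end of both buffers downwards, and the saved head
   block is stored last.  */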
L(1024bytesormore_bkw):
	add $512, %rdi
	vmovups 0x1C0(%rsi), %zmm8
	vmovups 0x180(%rsi), %zmm9
	vmovups 0x140(%rsi), %zmm10
	vmovups 0x100(%rsi), %zmm11
	vmovups 0xC0(%rsi), %zmm12
	vmovups 0x80(%rsi), %zmm13
	vmovups 0x40(%rsi), %zmm14
	vmovups (%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access. */
L(gobble_512bytes_loop_bkw):
	vmovups -0x40(%rcx), %zmm0
	vmovups -0x80(%rcx), %zmm1
	vmovups -0xC0(%rcx), %zmm2
	vmovups -0x100(%rcx), %zmm3
	vmovups -0x140(%rcx), %zmm4
	vmovups -0x180(%rcx), %zmm5
	vmovups -0x1C0(%rcx), %zmm6
	vmovups -0x200(%rcx), %zmm7
	sub $512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups %zmm0, -0x40(%r9)
	vmovups %zmm1, -0x80(%r9)
	vmovups %zmm2, -0xC0(%r9)
	vmovups %zmm3, -0x100(%r9)
	vmovups %zmm4, -0x140(%r9)
	vmovups %zmm5, -0x180(%r9)
	vmovups %zmm6, -0x1C0(%r9)
	vmovups %zmm7, -0x200(%r9)
	sub $512, %r9
	cmp %rdi, %r9
	ja L(gobble_512bytes_loop_bkw)
	vmovups %zmm8, -0x40(%rdi)
	vmovups %zmm9, -0x80(%rdi)
	vmovups %zmm10, -0xC0(%rdi)
	vmovups %zmm11, -0x100(%rdi)
	vmovups %zmm12, -0x140(%rdi)
	vmovups %zmm13, -0x180(%rdi)
	vmovups %zmm14, -0x1C0(%rdi)
	vmovups %zmm15, -0x200(%rdi)
	ret

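/* Large copies: stream the bulk of the data with non-temporal
   stores.  The first 128 bytes of the source are saved in
   %zmm4/%zmm5 and stored after the loop; that also covers the
   bytes skipped when the destination is rounded up to a 128-byte
   boundary below.  */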
L(preloop_large):
	cmp %rsi, %rdi
	ja L(preloop_large_bkw)
	vmovups (%rsi), %zmm4
	vmovups 0x40(%rsi), %zmm5

	mov %rdi, %r11
/* Align destination for access with non-temporal stores in the loop. */
	mov %rdi, %r8
	and $-0x80, %rdi
	add $0x80, %rdi
	sub %rdi, %r8
	sub %r8, %rsi
	add %r8, %rdx
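/* %r8 now holds minus the distance the destination was advanced;
   the source pointer and the byte count have been adjusted by the
   same amount, and the skipped prefix (at most 128 bytes) is
   covered by the %zmm4/%zmm5 store through %r11 after the loop.  */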
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub $256, %rdx
	add $256, %rsi
	add $256, %rdi
	cmp $256, %rdx
	ja L(gobble_256bytes_nt_loop)
	sfence
	vmovups %zmm4, (%r11)
	vmovups %zmm5, 0x40(%r11)
	jmp L(check)

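/* Backward variant of the non-temporal path, destination above the
   source.  The last 128 bytes of the source are saved in
   %zmm4/%zmm5 and stored at the original end of the destination
   after the loop; the remaining head is finished via L(check).  */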
L(preloop_large_bkw):
	vmovups -0x80(%rcx), %zmm4
	vmovups -0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores. */
	mov %r9, %r8
	and $-0x80, %r9
	sub %r9, %r8
	sub %r8, %rcx
	sub %r8, %rdx
	add %r9, %r8
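/* %r8 has been restored to the original destination end; %rcx and
   %rdx were reduced by the alignment skew, which is covered by the
   %zmm4/%zmm5 store through %r8 after the loop.  */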
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub $256, %rdx
	sub $256, %rcx
	sub $256, %r9
	cmp $256, %rdx
	ja L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups %zmm4, -0x80(%r8)
	vmovups %zmm5, -0x40(%r8)
	jmp L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif

Source: glibc/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S