/* memmove/memcpy/mempcpy optimized for aligned access with SSSE3.
   All versions must be listed in ifunc-impl-list.c.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>
# ifndef MEMMOVE
#  define MEMMOVE __memmove_ssse3
#  define MEMMOVE_CHK __memmove_chk_ssse3
#  define MEMCPY __memcpy_ssse3
#  define MEMCPY_CHK __memcpy_chk_ssse3
#  define MEMPCPY __mempcpy_ssse3
#  define MEMPCPY_CHK __mempcpy_chk_ssse3
# endif

        .section .text.ssse3, "ax", @progbits
# if defined SHARED
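/* __mempcpy_chk: %rcx holds the size of the destination buffer.  If it
   is smaller than the copy length in %rdx, fail the fortify check;
   otherwise fall through into MEMPCPY.  */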
ENTRY(MEMPCPY_CHK)
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET(__chk_fail)
END(MEMPCPY_CHK)
# endif

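/* mempcpy returns dst + n rather than dst, so set %rax here and join
   the common copy path at L(start), past memmove's `movq %rdi, %rax`.  */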
ENTRY(MEMPCPY)
        mov %RDI_LP, %RAX_LP
        add %RDX_LP, %RAX_LP
        jmp L(start)
END(MEMPCPY)

# if defined SHARED
ENTRY(MEMMOVE_CHK)
        cmp %RDX_LP, %RCX_LP
        jb HIDDEN_JUMPTARGET(__chk_fail)
END(MEMMOVE_CHK)
# endif

ENTRY_P2ALIGN(MEMMOVE, 6)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        movq %rdi, %rax
L(start):
        cmpq $16, %rdx
        jb L(copy_0_15)

        /* These loads are always useful.  */
        movups 0(%rsi), %xmm0
        movups -16(%rsi, %rdx), %xmm7
        cmpq $32, %rdx
        ja L(more_2x_vec)

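        /* 16 <= n <= 32: the head and tail vectors overlap (or abut)
           and together cover the whole range, so two unaligned stores
           finish the copy.  */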
        movups %xmm0, 0(%rdi)
        movups %xmm7, -16(%rdi, %rdx)
        ret

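        /* n < 16: copy with two potentially overlapping scalar moves of
           the head and tail (8-byte moves for 8..15, 4-byte moves for
           4..7), falling back to byte/word moves for 0..3.  */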
        .p2align 4,, 4
L(copy_0_15):
        cmpl $4, %edx
        jb L(copy_0_3)
        cmpl $8, %edx
        jb L(copy_4_7)
        movq 0(%rsi), %rcx
        movq -8(%rsi, %rdx), %rsi
        movq %rcx, 0(%rdi)
        movq %rsi, -8(%rdi, %rdx)
        ret

        .p2align 4,, 4
L(copy_4_7):
        movl 0(%rsi), %ecx
        movl -4(%rsi, %rdx), %esi
        movl %ecx, 0(%rdi)
        movl %esi, -4(%rdi, %rdx)
        ret

        .p2align 4,, 4
L(copy_0_3):
        decl %edx
        jl L(copy_0_0)
        movb (%rsi), %cl
        je L(copy_1_1)

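        /* n is 2 or 3 (%edx now holds n - 1): a word store covering the
           last two bytes plus the byte store below cover the whole
           range, overlapping when n == 2.  */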
        movzwl -1(%rsi, %rdx), %esi
        movw %si, -1(%rdi, %rdx)
L(copy_1_1):
        movb %cl, (%rdi)
L(copy_0_0):
        ret

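        /* 32 < n <= 64: copy with four vectors, two from the head and
           two from the tail; the middle pair may overlap.  */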
        .p2align 4,, 4
L(copy_4x_vec):
        movups 16(%rsi), %xmm1
        movups -32(%rsi, %rdx), %xmm2

        movups %xmm0, 0(%rdi)
        movups %xmm1, 16(%rdi)
        movups %xmm2, -32(%rdi, %rdx)
        movups %xmm7, -16(%rdi, %rdx)
L(nop):
        ret

        .p2align 4
L(more_2x_vec):
        cmpq $64, %rdx
        jbe L(copy_4x_vec)

        /* We use rcx later to get alignr value.  */
        movq %rdi, %rcx

        /* Backward copy for overlap + dst > src for memmove safety.  */
        subq %rsi, %rcx
        cmpq %rdx, %rcx
        jb L(copy_backward)
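        /* Forward copy: (dst - src) >= n, so the stores below never
           clobber source bytes that have not been read yet.  If src >
           dst the unsigned difference wraps to a huge value and the
           forward path is always taken.  */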

        /* Load tail.  */

        /* -16(%rsi, %rdx) already loaded into xmm7.  */
        movups -32(%rsi, %rdx), %xmm8
        movups -48(%rsi, %rdx), %xmm9

        /* Get misalignment.  */
        andl $0xf, %ecx

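        /* Save the original source in %r9 (L(large_memcpy) reloads its
           tail through it), then advance %rsi by the misalignment and
           round it down so the loops below can use 16-byte aligned
           loads recombined with `palignr`.  */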
        movq %rsi, %r9
        addq %rcx, %rsi
        andq $-16, %rsi
        /* Get first vec for `palignr`.  */
        movaps (%rsi), %xmm1

        /* We have loaded (%rsi) so safe to do this store before the
           loop.  */
        movups %xmm0, (%rdi)

# ifdef SHARED_CACHE_SIZE_HALF
        cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
# else
        cmp __x86_shared_cache_size_half(%rip), %rdx
# endif
        ja L(large_memcpy)

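        /* Medium-size forward copy: %r8 = dst + n - 64 is both the loop
           bound and the base of the final three tail stores, %rdi is
           rounded down so the loop stores are aligned, and %rdx is
           reused as the 48-byte loop stride.  `palignr` requires an
           immediate, so there is one loop per misalignment value; jump
           to L(loop_fwd_start) + misalignment * 64.  */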
        leaq -64(%rdi, %rdx), %r8
        andq $-16, %rdi
        movl $48, %edx

        leaq L(loop_fwd_start)(%rip), %r9
        sall $6, %ecx
        addq %r9, %rcx
        jmp *%rcx

        .p2align 4,, 8
L(copy_backward):
        testq %rcx, %rcx
        jz L(nop)

        /* Preload the low 48 bytes of the source now; they are stored
           only after the backward loop, whose stores may overwrite them
           when the regions overlap closely.  */

        /* (%rsi) already loaded into xmm0.  */
        movups 16(%rsi), %xmm4
        movups 32(%rsi), %xmm5

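        /* %r8 keeps the original dst for the final stores after the
           loop.  %rdi is set to the highest 16-byte-aligned address
           whose 48-byte store block still fits below dst + n; %rsi is
           moved to the corresponding source position and rounded down
           for aligned loads.  The indirect jump picks one of 16
           backward loops (64 bytes apart) based on the dst/src
           misalignment.  */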
        movq %rdi, %r8
        subq %rdi, %rsi
        leaq -49(%rdi, %rdx), %rdi
        andq $-16, %rdi
        addq %rdi, %rsi
        andq $-16, %rsi

        movaps 48(%rsi), %xmm6


        leaq L(loop_bkwd_start)(%rip), %r9
        andl $0xf, %ecx
        sall $6, %ecx
        addq %r9, %rcx
        jmp *%rcx

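        /* Copies larger than half the shared cache bypass the cache
           with non-temporal stores.  The large loops are laid out 96
           bytes apart, so the dispatch target is
           L(large_loop_fwd_start) + misalignment * 96
           (%ecx << 5, then times 3).  */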
        .p2align 4,, 8
L(large_memcpy):
        movups -64(%r9, %rdx), %xmm10
        movups -80(%r9, %rdx), %xmm11

        sall $5, %ecx
        leal (%rcx, %rcx, 2), %r8d
        leaq -96(%rdi, %rdx), %rcx
        andq $-16, %rdi
        leaq L(large_loop_fwd_start)(%rip), %rdx
        addq %r8, %rdx
        jmp *%rdx


        /* Instead of a typical jump table, all 16 loops are exactly
           64 bytes in size, so we can jump straight to
           L(loop_fwd_start) + misalignment * 64 (computed into %rcx
           above).  Before modifying any loop ensure all their sizes
           still match!  */
        .p2align 6
L(loop_fwd_start):
L(loop_fwd_0x0):
        movaps 16(%rsi), %xmm1
        movaps 32(%rsi), %xmm2
        movaps 48(%rsi), %xmm3
        movaps %xmm1, 16(%rdi)
        movaps %xmm2, 32(%rdi)
        movaps %xmm3, 48(%rdi)
        addq %rdx, %rdi
        addq %rdx, %rsi
        cmpq %rdi, %r8
        ja L(loop_fwd_0x0)
L(end_loop_fwd):
        movups %xmm9, 16(%r8)
        movups %xmm8, 32(%r8)
        movups %xmm7, 48(%r8)
        ret

        /* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
           60 bytes otherwise.  */
# define ALIGNED_LOOP_FWD(align_by); \
        .p2align 6; \
L(loop_fwd_ ## align_by): \
        movaps 16(%rsi), %xmm0; \
        movaps 32(%rsi), %xmm2; \
        movaps 48(%rsi), %xmm3; \
        movaps %xmm3, %xmm4; \
        palignr $align_by, %xmm2, %xmm3; \
        palignr $align_by, %xmm0, %xmm2; \
        palignr $align_by, %xmm1, %xmm0; \
        movaps %xmm4, %xmm1; \
        movaps %xmm0, 16(%rdi); \
        movaps %xmm2, 32(%rdi); \
        movaps %xmm3, 48(%rdi); \
        addq %rdx, %rdi; \
        addq %rdx, %rsi; \
        cmpq %rdi, %r8; \
        ja L(loop_fwd_ ## align_by); \
        jmp L(end_loop_fwd);

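        /* The dispatch above jumps to L(loop_fwd_start) + m * 64 for
           misalignment m; with the variants below laid out in
           descending order that lands on ALIGNED_LOOP_FWD (16 - m),
           whose `palignr` shift realigns the aligned source loads to
           the aligned destination stores.  */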
        /* Must be in descending order.  */
        ALIGNED_LOOP_FWD (0xf)
        ALIGNED_LOOP_FWD (0xe)
        ALIGNED_LOOP_FWD (0xd)
        ALIGNED_LOOP_FWD (0xc)
        ALIGNED_LOOP_FWD (0xb)
        ALIGNED_LOOP_FWD (0xa)
        ALIGNED_LOOP_FWD (0x9)
        ALIGNED_LOOP_FWD (0x8)
        ALIGNED_LOOP_FWD (0x7)
        ALIGNED_LOOP_FWD (0x6)
        ALIGNED_LOOP_FWD (0x5)
        ALIGNED_LOOP_FWD (0x4)
        ALIGNED_LOOP_FWD (0x3)
        ALIGNED_LOOP_FWD (0x2)
        ALIGNED_LOOP_FWD (0x1)

        .p2align 6
L(large_loop_fwd_start):
L(large_loop_fwd_0x0):
        movaps 16(%rsi), %xmm1
        movaps 32(%rsi), %xmm2
        movaps 48(%rsi), %xmm3
        movaps 64(%rsi), %xmm4
        movaps 80(%rsi), %xmm5
        movntps %xmm1, 16(%rdi)
        movntps %xmm2, 32(%rdi)
        movntps %xmm3, 48(%rdi)
        movntps %xmm4, 64(%rdi)
        movntps %xmm5, 80(%rdi)
        addq $80, %rdi
        addq $80, %rsi
        cmpq %rdi, %rcx
        ja L(large_loop_fwd_0x0)

        /* Ensure no icache line split on tail.  */
        .p2align 4
L(end_large_loop_fwd):
        sfence
        movups %xmm11, 16(%rcx)
        movups %xmm10, 32(%rcx)
        movups %xmm9, 48(%rcx)
        movups %xmm8, 64(%rcx)
        movups %xmm7, 80(%rcx)
        ret


        /* Each of these loops is more than 64 and at most 96 bytes
           long; with 32-byte alignment between them this guarantees
           exactly 96-byte spacing, matching the misalignment * 96
           dispatch above.  */
# define ALIGNED_LARGE_LOOP_FWD(align_by); \
        .p2align 5; \
L(large_loop_fwd_ ## align_by): \
        movaps 16(%rsi), %xmm0; \
        movaps 32(%rsi), %xmm2; \
        movaps 48(%rsi), %xmm3; \
        movaps 64(%rsi), %xmm4; \
        movaps 80(%rsi), %xmm5; \
        movaps %xmm5, %xmm6; \
        palignr $align_by, %xmm4, %xmm5; \
        palignr $align_by, %xmm3, %xmm4; \
        palignr $align_by, %xmm2, %xmm3; \
        palignr $align_by, %xmm0, %xmm2; \
        palignr $align_by, %xmm1, %xmm0; \
        movaps %xmm6, %xmm1; \
        movntps %xmm0, 16(%rdi); \
        movntps %xmm2, 32(%rdi); \
        movntps %xmm3, 48(%rdi); \
        movntps %xmm4, 64(%rdi); \
        movntps %xmm5, 80(%rdi); \
        addq $80, %rdi; \
        addq $80, %rsi; \
        cmpq %rdi, %rcx; \
        ja L(large_loop_fwd_ ## align_by); \
        jmp L(end_large_loop_fwd);

        /* Must be in descending order.  */
        ALIGNED_LARGE_LOOP_FWD (0xf)
        ALIGNED_LARGE_LOOP_FWD (0xe)
        ALIGNED_LARGE_LOOP_FWD (0xd)
        ALIGNED_LARGE_LOOP_FWD (0xc)
        ALIGNED_LARGE_LOOP_FWD (0xb)
        ALIGNED_LARGE_LOOP_FWD (0xa)
        ALIGNED_LARGE_LOOP_FWD (0x9)
        ALIGNED_LARGE_LOOP_FWD (0x8)
        ALIGNED_LARGE_LOOP_FWD (0x7)
        ALIGNED_LARGE_LOOP_FWD (0x6)
        ALIGNED_LARGE_LOOP_FWD (0x5)
        ALIGNED_LARGE_LOOP_FWD (0x4)
        ALIGNED_LARGE_LOOP_FWD (0x3)
        ALIGNED_LARGE_LOOP_FWD (0x2)
        ALIGNED_LARGE_LOOP_FWD (0x1)


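        /* Backward loops: copy 48 bytes per iteration from high to low
           addresses until %rdi drops to %r8 (the original dst), then
           store the preloaded head and tail vectors.  Dispatch is
           L(loop_bkwd_start) + misalignment * 64, as in the forward
           case.  */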
        .p2align 6
L(loop_bkwd_start):
L(loop_bkwd_0x0):
        movaps 32(%rsi), %xmm1
        movaps 16(%rsi), %xmm2
        movaps 0(%rsi), %xmm3
        movaps %xmm1, 32(%rdi)
        movaps %xmm2, 16(%rdi)
        movaps %xmm3, 0(%rdi)
        subq $48, %rdi
        subq $48, %rsi
        cmpq %rdi, %r8
        jb L(loop_bkwd_0x0)
L(end_loop_bkwd):
        movups %xmm7, -16(%r8, %rdx)
        movups %xmm0, 0(%r8)
        movups %xmm4, 16(%r8)
        movups %xmm5, 32(%r8)

        ret


        /* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
           60 bytes otherwise.  */
# define ALIGNED_LOOP_BKWD(align_by); \
        .p2align 6; \
L(loop_bkwd_ ## align_by): \
        movaps 32(%rsi), %xmm1; \
        movaps 16(%rsi), %xmm2; \
        movaps 0(%rsi), %xmm3; \
        palignr $align_by, %xmm1, %xmm6; \
        palignr $align_by, %xmm2, %xmm1; \
        palignr $align_by, %xmm3, %xmm2; \
        movaps %xmm6, 32(%rdi); \
        movaps %xmm1, 16(%rdi); \
        movaps %xmm2, 0(%rdi); \
        subq $48, %rdi; \
        subq $48, %rsi; \
        movaps %xmm3, %xmm6; \
        cmpq %rdi, %r8; \
        jb L(loop_bkwd_ ## align_by); \
        jmp L(end_loop_bkwd);

        /* Must be in descending order.  */
        ALIGNED_LOOP_BKWD (0xf)
        ALIGNED_LOOP_BKWD (0xe)
        ALIGNED_LOOP_BKWD (0xd)
        ALIGNED_LOOP_BKWD (0xc)
        ALIGNED_LOOP_BKWD (0xb)
        ALIGNED_LOOP_BKWD (0xa)
        ALIGNED_LOOP_BKWD (0x9)
        ALIGNED_LOOP_BKWD (0x8)
        ALIGNED_LOOP_BKWD (0x7)
        ALIGNED_LOOP_BKWD (0x6)
        ALIGNED_LOOP_BKWD (0x5)
        ALIGNED_LOOP_BKWD (0x4)
        ALIGNED_LOOP_BKWD (0x3)
        ALIGNED_LOOP_BKWD (0x2)
        ALIGNED_LOOP_BKWD (0x1)
END(MEMMOVE)

strong_alias (MEMMOVE, MEMCPY)
# if defined SHARED
strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
# endif
#endif


/* Source: glibc/sysdeps/x86_64/multiarch/memmove-ssse3.S.  */