/* wcscpy with SSSE3
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 4 because there are no V3/V4
   implementations, so we need this to build for ISA V3/V4
   builds.  */
#if ISA_SHOULD_BUILD (4)

# ifndef WCSCPY
# define WCSCPY __wcscpy_ssse3
# endif

# include <sysdep.h>

	.section .text.ssse3,"ax",@progbits
ENTRY (WCSCPY)

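/* wcscpy (dst, src) copies the wide string at src, including the
   terminating null wide char, to dst and returns dst.  On x86-64 a
   wide char (wchar_t) is 4 bytes, hence the dword-wise pcmpeqd
   comparisons below.  %rdi (dst) is preserved so it can be returned in
   %rax; the working pointers are %rdx (dst) and %rcx (src).  */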
	mov %rsi, %rcx
	mov %rdi, %rdx

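/* If one of the first four wide chars is the null terminator, the
   whole string fits in at most 16 bytes and is copied directly.  */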
	cmpl $0, (%rcx)
	jz L(Exit4)
	cmpl $0, 4(%rcx)
	jz L(Exit8)
	cmpl $0, 8(%rcx)
	jz L(Exit12)
	cmpl $0, 12(%rcx)
	jz L(Exit16)

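/* Copy the first 16 bytes unconditionally and test the first 16-byte
   aligned block past the start of the source for a null wide char;
   %rsi is then turned into the offset of that block from %rcx.  */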
	lea 16(%rcx), %rsi
	and $-16, %rsi

	pxor %xmm0, %xmm0
	mov (%rcx), %r9
	mov %r9, (%rdx)

	pcmpeqd (%rsi), %xmm0
	mov 8(%rcx), %r9
	mov %r9, 8(%rdx)

	pmovmskb %xmm0, %rax
	sub %rcx, %rsi

	test %rax, %rax
	jnz L(CopyFrom1To16Bytes)

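/* Round the destination up to a 16-byte boundary and advance the
   source by the same amount (the skipped bytes were copied above).
   The low four bits of the adjusted source give the relative
   misalignment of source and destination and select the copy strategy
   below.  */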
	mov %rdx, %rax
	addq $16, %rdx
	and $-16, %rdx
	sub %rdx, %rax
	sub %rax, %rcx
	mov %rcx, %rax
	and $0xf, %rax
	mov $0, %rsi

/* case: rcx_offset == rdx_offset */

	jz L(Align16Both)

	cmp $4, %rax
	je L(Shl4)
	cmp $8, %rax
	je L(Shl8)
	jmp L(Shl12)

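/* Source and destination are now both 16-byte aligned.  Copy one
   16-byte block per step, testing the next source block for a null
   wide char before it is stored, until the source can be rounded down
   to a 64-byte boundary for the main loop.  */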
L(Align16Both):
	movaps (%rcx), %xmm1
	movaps 16(%rcx), %xmm2
	movaps %xmm1, (%rdx)
	pcmpeqd %xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq $16, %rsi

	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	movaps 16(%rcx, %rsi), %xmm3
	movaps %xmm2, (%rdx, %rsi)
	pcmpeqd %xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq $16, %rsi

	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	movaps 16(%rcx, %rsi), %xmm4
	movaps %xmm3, (%rdx, %rsi)
	pcmpeqd %xmm4, %xmm0
	pmovmskb %xmm0, %eax
	addq $16, %rsi

	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	movaps 16(%rcx, %rsi), %xmm1
	movaps %xmm4, (%rdx, %rsi)
	pcmpeqd %xmm1, %xmm0
	pmovmskb %xmm0, %eax
	addq $16, %rsi

	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	movaps 16(%rcx, %rsi), %xmm2
	movaps %xmm1, (%rdx, %rsi)
	pcmpeqd %xmm2, %xmm0
	pmovmskb %xmm0, %eax
	addq $16, %rsi

	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	movaps 16(%rcx, %rsi), %xmm3
	movaps %xmm2, (%rdx, %rsi)
	pcmpeqd %xmm3, %xmm0
	pmovmskb %xmm0, %eax
	addq $16, %rsi

	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	movaps %xmm3, (%rdx, %rsi)
	mov %rcx, %rax
	lea 16(%rcx, %rsi), %rcx
	and $-0x40, %rcx
	sub %rcx, %rax
	sub %rax, %rdx

	mov $-0x40, %rsi

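/* Main loop: the source has been rounded down to a 64-byte boundary
   and the destination adjusted by the same amount.  Each iteration
   reads four 16-byte blocks and folds them with pminub; a zero dword
   in the folded value flags a possible null wide char, which
   L(Aligned64Leave) locates exactly.  Otherwise all 64 bytes are
   stored and the loop continues.  */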
	.p2align 4
L(Aligned64Loop):
	movaps (%rcx), %xmm2
	movaps %xmm2, %xmm4
	movaps 16(%rcx), %xmm5
	movaps 32(%rcx), %xmm3
	movaps %xmm3, %xmm6
	movaps 48(%rcx), %xmm7
	pminub %xmm5, %xmm2
	pminub %xmm7, %xmm3
	pminub %xmm2, %xmm3
	pcmpeqd %xmm0, %xmm3
	pmovmskb %xmm3, %eax
	addq $64, %rdx
	addq $64, %rcx
	testl %eax, %eax
	jnz L(Aligned64Leave)
	movaps %xmm4, -64(%rdx)
	movaps %xmm5, -48(%rdx)
	movaps %xmm6, -32(%rdx)
	movaps %xmm7, -16(%rdx)
	jmp L(Aligned64Loop)

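/* A possible null wide char was flagged in the last 64-byte chunk.
   Re-check the four blocks one at a time, storing the ones that
   precede the null; %rsi starts at -0x40 so that
   L(CopyFrom1To16Bytes) sees pointers into the block holding the null.
   If the flag was a false positive of the pminub trick, the main loop
   resumes.  */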
L(Aligned64Leave):
	pcmpeqd %xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	pcmpeqd %xmm5, %xmm0

	pmovmskb %xmm0, %eax
	movaps %xmm4, -64(%rdx)
	addq $16, %rsi
	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	pcmpeqd %xmm6, %xmm0

	pmovmskb %xmm0, %eax
	movaps %xmm5, -48(%rdx)
	addq $16, %rsi
	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	movaps %xmm6, -32(%rdx)
	pcmpeqd %xmm7, %xmm0

	pmovmskb %xmm0, %eax
	addq $16, %rsi
	test %eax, %eax
	jnz L(CopyFrom1To16Bytes)

	mov $-0x40, %rsi
	movaps %xmm7, -16(%rdx)
	jmp L(Aligned64Loop)

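/* Source and destination alignments differ by 4 bytes.  Load the
   source from 16-byte aligned addresses (starting at %rcx - 4) and
   stitch consecutive blocks together with palignr to feed aligned
   16-byte stores.  After a few unrolled copies the pointers are
   repositioned for the 64-bytes-per-iteration loop at
   L(Shl4LoopStart).  */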
	.p2align 4
L(Shl4):
	movaps -4(%rcx), %xmm1
	movaps 12(%rcx), %xmm2
L(Shl4Start):
	pcmpeqd %xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps %xmm2, %xmm3

	test %eax, %eax
	jnz L(Shl4LoopExit)

	palignr $4, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 28(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx
	movaps %xmm2, %xmm1

	test %eax, %eax
	jnz L(Shl4LoopExit)

	palignr $4, %xmm3, %xmm2
	movaps %xmm2, (%rdx)
	movaps 28(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx
	movaps %xmm2, %xmm3

	test %eax, %eax
	jnz L(Shl4LoopExit)

	palignr $4, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 28(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx

	test %eax, %eax
	jnz L(Shl4LoopExit)

	palignr $4, %xmm3, %xmm2
	movaps %xmm2, (%rdx)
	addq $28, %rcx
	addq $16, %rdx

	mov %rcx, %rax
	and $-0x40, %rcx
	sub %rcx, %rax
	addq $-12, %rcx
	sub %rax, %rdx

	movaps -4(%rcx), %xmm1

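/* Steady-state loop for the 4-byte shift: read four aligned blocks
   ahead, check them for a null wide char with the same pminub trick as
   above, and if none is flagged shift them into place with palignr and
   store 64 bytes.  On a hit, fall back to L(Shl4Start) to locate the
   null one block at a time.  */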
	.p2align 4
L(Shl4LoopStart):
	movaps 12(%rcx), %xmm2
	movaps 28(%rcx), %xmm3
	movaps %xmm3, %xmm6
	movaps 44(%rcx), %xmm4
	movaps %xmm4, %xmm7
	movaps 60(%rcx), %xmm5
	pminub %xmm2, %xmm6
	pminub %xmm5, %xmm7
	pminub %xmm6, %xmm7
	pcmpeqd %xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps %xmm5, %xmm7
	palignr $4, %xmm4, %xmm5
	palignr $4, %xmm3, %xmm4
	test %eax, %eax
	jnz L(Shl4Start)

	palignr $4, %xmm2, %xmm3
	addq $64, %rcx
	palignr $4, %xmm1, %xmm2
	movaps %xmm7, %xmm1
	movaps %xmm5, 48(%rdx)
	movaps %xmm4, 32(%rdx)
	movaps %xmm3, 16(%rdx)
	movaps %xmm2, (%rdx)
	addq $64, %rdx
	jmp L(Shl4LoopStart)

L(Shl4LoopExit):
	movdqu -4(%rcx), %xmm1
	mov $12, %rsi
	movdqu %xmm1, -4(%rdx)
	jmp L(CopyFrom1To16Bytes)

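/* Same scheme as L(Shl4) for an 8-byte alignment difference: aligned
   loads start at %rcx - 8 and the blocks are recombined with
   palignr $8.  */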
	.p2align 4
L(Shl8):
	movaps -8(%rcx), %xmm1
	movaps 8(%rcx), %xmm2
L(Shl8Start):
	pcmpeqd %xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps %xmm2, %xmm3

	test %eax, %eax
	jnz L(Shl8LoopExit)

	palignr $8, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 24(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx
	movaps %xmm2, %xmm1

	test %eax, %eax
	jnz L(Shl8LoopExit)

	palignr $8, %xmm3, %xmm2
	movaps %xmm2, (%rdx)
	movaps 24(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx
	movaps %xmm2, %xmm3

	test %eax, %eax
	jnz L(Shl8LoopExit)

	palignr $8, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 24(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx

	test %eax, %eax
	jnz L(Shl8LoopExit)

	palignr $8, %xmm3, %xmm2
	movaps %xmm2, (%rdx)
	addq $24, %rcx
	addq $16, %rdx

	mov %rcx, %rax
	and $-0x40, %rcx
	sub %rcx, %rax
	addq $-8, %rcx
	sub %rax, %rdx

	movaps -8(%rcx), %xmm1

	.p2align 4
L(Shl8LoopStart):
	movaps 8(%rcx), %xmm2
	movaps 24(%rcx), %xmm3
	movaps %xmm3, %xmm6
	movaps 40(%rcx), %xmm4
	movaps %xmm4, %xmm7
	movaps 56(%rcx), %xmm5
	pminub %xmm2, %xmm6
	pminub %xmm5, %xmm7
	pminub %xmm6, %xmm7
	pcmpeqd %xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps %xmm5, %xmm7
	palignr $8, %xmm4, %xmm5
	palignr $8, %xmm3, %xmm4
	test %eax, %eax
	jnz L(Shl8Start)

	palignr $8, %xmm2, %xmm3
	addq $64, %rcx
	palignr $8, %xmm1, %xmm2
	movaps %xmm7, %xmm1
	movaps %xmm5, 48(%rdx)
	movaps %xmm4, 32(%rdx)
	movaps %xmm3, 16(%rdx)
	movaps %xmm2, (%rdx)
	addq $64, %rdx
	jmp L(Shl8LoopStart)

L(Shl8LoopExit):
	mov (%rcx), %r9
	mov $8, %rsi
	mov %r9, (%rdx)
	jmp L(CopyFrom1To16Bytes)

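/* Same scheme again for a 12-byte alignment difference: aligned loads
   start at %rcx - 12 and the blocks are recombined with palignr $12.  */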
	.p2align 4
L(Shl12):
	movaps -12(%rcx), %xmm1
	movaps 4(%rcx), %xmm2
L(Shl12Start):
	pcmpeqd %xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps %xmm2, %xmm3

	test %eax, %eax
	jnz L(Shl12LoopExit)

	palignr $12, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 20(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx
	movaps %xmm2, %xmm1

	test %eax, %eax
	jnz L(Shl12LoopExit)

	palignr $12, %xmm3, %xmm2
	movaps %xmm2, (%rdx)
	movaps 20(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx
	movaps %xmm2, %xmm3

	test %eax, %eax
	jnz L(Shl12LoopExit)

	palignr $12, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 20(%rcx), %xmm2

	pcmpeqd %xmm2, %xmm0
	addq $16, %rdx
	pmovmskb %xmm0, %eax
	addq $16, %rcx

	test %eax, %eax
	jnz L(Shl12LoopExit)

	palignr $12, %xmm3, %xmm2
	movaps %xmm2, (%rdx)
	addq $20, %rcx
	addq $16, %rdx

	mov %rcx, %rax
	and $-0x40, %rcx
	sub %rcx, %rax
	addq $-4, %rcx
	sub %rax, %rdx

	movaps -12(%rcx), %xmm1

	.p2align 4
L(Shl12LoopStart):
	movaps 4(%rcx), %xmm2
	movaps 20(%rcx), %xmm3
	movaps %xmm3, %xmm6
	movaps 36(%rcx), %xmm4
	movaps %xmm4, %xmm7
	movaps 52(%rcx), %xmm5
	pminub %xmm2, %xmm6
	pminub %xmm5, %xmm7
	pminub %xmm6, %xmm7
	pcmpeqd %xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps %xmm5, %xmm7
	palignr $12, %xmm4, %xmm5
	palignr $12, %xmm3, %xmm4
	test %eax, %eax
	jnz L(Shl12Start)
	palignr $12, %xmm2, %xmm3
	addq $64, %rcx
	palignr $12, %xmm1, %xmm2
	movaps %xmm7, %xmm1
	movaps %xmm5, 48(%rdx)
	movaps %xmm4, 32(%rdx)
	movaps %xmm3, 16(%rdx)
	movaps %xmm2, (%rdx)
	addq $64, %rdx
	jmp L(Shl12LoopStart)

L(Shl12LoopExit):
	mov (%rcx), %r9d
	mov $4, %rsi
	mov %r9d, (%rdx)
	jmp L(CopyFrom1To16Bytes)

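/* Copy the last 1 to 16 bytes.  On entry %rsi is the offset (relative
   to %rcx/%rdx) of the 16-byte block holding the terminating null wide
   char, and %eax holds the pcmpeqd/pmovmskb mask for that block, so
   the low and high halves of %ax tell which of its four wide chars is
   the null.  */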
	.p2align 4
L(CopyFrom1To16Bytes):
	add %rsi, %rdx
	add %rsi, %rcx

	test %al, %al
	jz L(ExitHigh)
	test $0x01, %al
	jnz L(Exit4)

	mov (%rcx), %rax
	mov %rax, (%rdx)
	mov %rdi, %rax
	ret

	.p2align 4
L(ExitHigh):
	test $0x01, %ah
	jnz L(Exit12)

	mov (%rcx), %rax
	mov %rax, (%rdx)
	mov 8(%rcx), %rax
	mov %rax, 8(%rdx)
	mov %rdi, %rax
	ret

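/* L(ExitN): copy the final N bytes, the last wide char of which is the
   null terminator, and return the original destination in %rax.  */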
	.p2align 4
L(Exit4):
	movl (%rcx), %eax
	movl %eax, (%rdx)
	mov %rdi, %rax
	ret

	.p2align 4
L(Exit8):
	mov (%rcx), %rax
	mov %rax, (%rdx)
	mov %rdi, %rax
	ret

	.p2align 4
L(Exit12):
	mov (%rcx), %rax
	mov %rax, (%rdx)
	mov 8(%rcx), %eax
	mov %eax, 8(%rdx)
	mov %rdi, %rax
	ret

	.p2align 4
L(Exit16):
	mov (%rcx), %rax
	mov %rax, (%rdx)
	mov 8(%rcx), %rax
	mov %rax, 8(%rdx)
	mov %rdi, %rax
	ret

END(WCSCPY)
#endif

