/* strncpy with AVX2
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>


# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRNCPY
#  define STRNCPY	__strncpy_avx2
# endif


# ifdef USE_AS_WCSCPY
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)


	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter zero-length strings and very long strings.  Zero-length
	   strings just return.  Very long strings are handled by running
	   rep stos{b|l} to zero-fill the destination (which will almost
	   certainly segfault); if that somehow succeeds, OVERFLOW_STRCPY
	   (strcpy, stpcpy, wcscpy, wcpcpy) finishes the copy.  */
# ifdef USE_AS_WCSCPY
	decq	%rdx
	movq	%rdx, %rax
	/* 56 is end of max supported address space.  */
	shr	$56, %rax
	jnz	L(zero_len)
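	/* Convert the character count to a byte count (CHAR_SIZE == 4
	   for wide characters).  */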
	salq	$2, %rdx
# else
	decq	%rdx
	/* `dec` can macro-fuse with `jl`.  If the branch ever needs to
	   become `jb`, replace `dec` with `sub` (`dec` does not set the
	   carry flag).  */
	jl	L(zero_len)
# endif

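	/* VZERO is the all-zeros vector compared against the source data
	   to find the null terminator.  Zeroing the 128-bit half also
	   clears the upper bits of the full register.  */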
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
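	/* If the VEC_SIZE-byte load below would cross a page boundary
	   (source offset within the page is greater than
	   PAGE_SIZE - VEC_SIZE), take the slow page-cross path.  */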
	movl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	/* If not STPCPY the return value is simply the original
	   destination, so save it ahead of time.  */
# ifndef USE_AS_STPCPY
	movq	%rdi, %rax
# elif defined USE_AS_WCSCPY
	/* Break the dependency on %rax, as nearly all of the wcpncpy
	   return paths build the return value with `setc %al`.  */
	xorl	%eax, %eax
# endif

	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
	/* `jbe` because length rdx is now length - CHAR_SIZE.  The
	   flags from this cmp are reused at L(less_1x_vec).  */
	jbe	L(less_1x_vec)

	/* This store may write more than the string needs, but that is
	   fine because we still have to zero-fill up to the length
	   anyway.  */
	VMOVU	%VMM(0), (%rdi)

	testl	%ecx, %ecx
	jnz	L(zfill)

	/* Align.  */
	addq	%rsi, %rdx
	subq	%rsi, %rdi
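	/* Round rsi up to the next VEC_SIZE boundary (this always moves
	   past the vector that was just copied).  rdi now holds
	   dst - src, so adding the new rsi back recovers the matching
	   destination pointer.  */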
	orq	$(VEC_SIZE - 1), %rsi
	incq	%rsi
L(last_4x_vec):
	addq	%rsi, %rdi
L(loop_last_4x_vec):
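	/* rdx was set to src + (length - CHAR_SIZE), so after this
	   subtraction it holds the remaining length (minus CHAR_SIZE)
	   measured from the current source position.  */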
	subq	%rsi, %rdx


	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	cmpq	$(VEC_SIZE * 2), %rdx
	jae	L(more_2x_vec)

	cmpl	$(VEC_SIZE), %edx
	jb	L(ret_vec_x1_len)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
	VMOVU	%VMM(1), (%rdi)
	vpmovmskb %VMM(6), %ecx
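	/* Shift the second vector's mask into bits [VEC_SIZE, 2 * VEC_SIZE)
	   so a single tzcntq yields the byte offset of the first null
	   from %rsi (or 2 * VEC_SIZE if there is none).  */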
	shlq	$VEC_SIZE, %rcx
L(ret_vec_x1_len):
	tzcntq	%rcx, %rcx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len_no_zfill)
	/* The expected fall-through case is string length < buffer
	   length.  */
	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
L(ret_vec_x1_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* Clear the carry flag for the `adc`/`setc` used to build the
	   return value below.  */
	xorl	%ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 6
L(ret_vec_x1):
	bsfl	%ecx, %ecx
	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
	subl	%ecx, %edx
	/* Check if we need to reload/store.  */
	cmpl	$VEC_SIZE, %edx
	jb	L(ret_vec_x1_len_no_zfill_mov)
	/* Otherwise safe to just store directly.  */
	VMOVU	%VMM(1), (%rdi)
	VMOVU	%VZERO, (%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(%rdi, %rcx), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 12
L(more_2x_vec):
	VMOVU	%VMM(1), (%rdi)
	testl	%ecx, %ecx
	/* Must fill at least 2x VEC.  */
	jnz	L(zfill_vec1)

	VMOVA	VEC_SIZE(%rsi), %VMM(2)
	VMOVU	%VMM(2), VEC_SIZE(%rdi)
	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	/* Must fill at least 1x VEC.  */
	jnz	L(zfill_vec2)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	/* Check if len is more than 4x VEC.  -CHAR_SIZE because rdx is
	   len - CHAR_SIZE.  */
	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
	ja	L(more_4x_vec)

	subl	$(VEC_SIZE * 3), %edx
	jb	L(ret_vec_x3_len)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	vpmovmskb %VMM(6), %ecx
	tzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x4_len_no_zfill)
	/* The expected fall-through case is string length < buffer
	   length.  */
	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
	movl	%ecx, %edx
L(ret_vec_x4_len_no_zfill):
	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	VZEROUPPER_RETURN


L(ret_vec_x3_len):
	addl	$(VEC_SIZE * 1), %edx
	tzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len_no_zfill)
	/* The expected fall-through case is string length < buffer
	   length.  */
	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
L(ret_vec_x3_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* Clear the carry flag for the `adc`/`setc` used to build the
	   return value below.  */
	xorl	%ecx, %ecx
# endif
	.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 8
L(ret_vec_x3):
	bsfl	%ecx, %ecx
	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
	subl	%ecx, %edx
	jl	L(ret_vec_x3_len_no_zfill_mov)
	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(more_4x_vec):

	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	testl	%ecx, %ecx
	jnz	L(zfill_vec3)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec4)

	movq	%rdx, %rcx
	addq	%rsi, %rdx
	subq	%rsi, %rdi
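	/* subq of a negative immediate is used instead of addq because
	   -(VEC_SIZE * 4) fits in a sign-extended 8-bit immediate while
	   +(VEC_SIZE * 4) does not.  */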
	subq	$-(VEC_SIZE * 4), %rsi
	/* Recheck length before aligning.  */
	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
	jbe	L(last_4x_vec)

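	/* Align rsi down to a 4 * VEC_SIZE boundary so the loop below
	   can use aligned loads.  */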
	andq	$(VEC_SIZE * -4), %rsi

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

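	/* The unsigned minimum of the four vectors has a zero element
	   iff at least one of them does, so a single compare + movemask
	   detects a null anywhere in the 4x VEC block.  */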
	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %r8d
	addq	%rsi, %rdi
	testl	%r8d, %r8d
	jnz	L(loop_4x_done)

	/* Use r9 as end register.  */
	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9

	.p2align 4,, 11
L(loop_4x_vec):

	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)

	subq	$(VEC_SIZE * -4), %rdi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %r8d

	testl	%r8d, %r8d
	jz	L(loop_4x_vec)

L(loop_4x_done):
	subq	%rsi, %rdx
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec1)

	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec2)

	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec3)

	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
	movl	%r8d, %ecx

	/* Zero-fill paths below.  */

	.p2align 4,, 4
L(zfill_vec4):
	addq	$(VEC_SIZE * 2), %rdi
	subq	$(VEC_SIZE * 2), %rdx
L(zfill_vec2):
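	/* The null was found in the second of a pair of vectors; shift
	   the mask up by VEC_SIZE so the bsf below yields its offset
	   from the first vector's base.  */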
	shlq	$VEC_SIZE, %rcx
L(zfill):
	bsfq	%rcx, %rcx
	subq	%rcx, %rdx
	addq	%rcx, %rdi
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_from_page_cross):
	cmpq	$VEC_SIZE, %rdx
	jb	L(zfill_less_vec_vzeroupper)

L(zfill_more_1x_vec):
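	/* Zero-fill with two potentially overlapping stores: one VEC
	   just past the null terminator and one VEC ending at the end
	   of the buffer.  */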
	VMOVU	%VZERO, CHAR_SIZE(%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
	cmpq	$(VEC_SIZE * 2), %rdx
	jae	L(zfill_more_2x_vec)
L(zfill_done0):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(zfill_vec3):
	addq	$(VEC_SIZE * 2), %rdi
	subq	$(VEC_SIZE * 2), %rdx
	.p2align 4,, 2
L(zfill_vec1):
	bsfl	%ecx, %ecx
	addq	%rcx, %rdi
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
	/* A zfill entered from vec1/vec3 always has to set at least 2x
	   VECs.  */

	VMOVU	%VZERO, CHAR_SIZE(%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
	cmpq	$(VEC_SIZE * 2), %rdx
	jb	L(zfill_done0)
L(zfill_more_2x_vec):
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
	jbe	L(zfill_done)

	addq	%rdi, %rdx
	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)


	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
	cmpq	%rdi, %rdx
	jbe	L(zfill_done)

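	/* Align rdi down so the loop below can use aligned stores; the
	   bytes before the aligned point were already zeroed above, so
	   the overlap is harmless.  */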
	andq	$-(VEC_SIZE), %rdi
	.p2align 4,, 12
L(zfill_loop_4x_vec):
	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(zfill_loop_4x_vec)
L(zfill_done):
	VZEROUPPER_RETURN


	.p2align 4,, 8
L(copy_1x):
	VMOVU	%VMM(0), (%rdi)
	testl	%ecx, %ecx
	jz	L(ret_32_32)
L(zfill_less_vec):
	bsfl	%ecx, %ecx
L(zfill_less_vec_no_bsf):
	subq	%rcx, %rdx
	addq	%rcx, %rdi
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_less_vec_vzeroupper):
	COND_VZEROUPPER
	/* We are taking advantage of the fact that to get here we must
	   have written the null terminator at (%rdi, %rcx), so we have
	   a byte of leeway for overwriting.  */
	cmpl	$16, %edx
	jb	L(zfill_less_16)
	VMOVU	%VZERO_128, (%rdi)
	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
	ret
# ifdef USE_AS_STPCPY
L(ret_32_32):
	leaq	CHAR_SIZE(%rdi, %rdx), %rax
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 4
L(copy_16_31):
	/* Overfill to avoid branches.  */
	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
	cmpl	%ecx, %edx
	ja	L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_32_32):
# else
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(copy_8_15):
	/* Overfill to avoid branches.  */
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
	vmovq	%xmm0, (%rdi)
	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
	cmpl	%ecx, %edx
	jbe	L(ret_8_15)
	subq	%rcx, %rdx
	addq	%rcx, %rdi
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
	.p2align 4,, 8
L(zfill_less_16):
	xorl	%ecx, %ecx
	cmpl	$8, %edx
	jb	L(zfill_less_8)
	movq	%rcx, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
# ifndef USE_AS_STPCPY
L(ret_8_15):
# endif
	ret


	.p2align 4,, 8
L(less_1x_vec):
	/* Reuse the flags from the `cmp $(VEC_SIZE - CHAR_SIZE), %rdx`
	   above (equal when length == VEC_SIZE).  The idea is that many
	   buffer sizes are conventionally aligned.  */
	je	L(copy_1x)

	tzcntl	%ecx, %ecx
	cmpl	$16, %edx
	jae	L(copy_16_31)

	COND_VZEROUPPER
	cmpl	$8, %edx
	jae	L(copy_8_15)
# ifdef USE_AS_WCSCPY
	testl	%ecx, %ecx
	jz	L(zfill_less_8_set_ret)

	movl	(%rsi, %rdx), %esi
	vmovd	%xmm0, (%rdi)
	movl	%esi, (%rdi, %rdx)

#  ifdef USE_AS_STPCPY
	cmpl	%ecx, %edx
L(ret_8_15):
	setc	%al
	addq	%rdx, %rdi
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#  endif
	ret
L(zfill_less_8_set_ret):
	xorl	%ecx, %ecx
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif
L(zfill_less_8):
	movl	%ecx, (%rdi)
	movl	%ecx, (%rdi, %rdx)
	ret

# else
	cmpl	$3, %edx
	jb	L(copy_0_3)
	/* Overfill to avoid branches.  */
	movl	-3(%rsi, %rdx), %esi
	vmovd	%xmm0, (%rdi)
	movl	%esi, -3(%rdi, %rdx)
	cmpl	%ecx, %edx
	jbe	L(ret_4_7)
	subq	%rcx, %rdx
	addq	%rcx, %rdi
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif
	xorl	%ecx, %ecx
	.p2align 4,, 8
L(zfill_less_8):
	cmpl	$3, %edx
	jb	L(zfill_less_3)
	movl	%ecx, (%rdi)
	movl	%ecx, -3(%rdi, %rdx)
#  ifdef USE_AS_STPCPY
	ret
#  endif

L(ret_4_7):
#  ifdef USE_AS_STPCPY
L(ret_8_15):
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
	ret

	.p2align 4,, 4
L(zfill_less_3):
	testl	%edx, %edx
	jz	L(zfill_1)
	movw	%cx, (%rdi)
L(zfill_1):
	movb	%cl, (%rdi, %rdx)
	ret

	.p2align 4,, 8
L(copy_0_3):
	vmovd	%xmm0, %r8d
	testl	%edx, %edx
	jz	L(copy_1)
	movw	%r8w, (%rdi)
	cmpl	%ecx, %edx
	ja	L(zfill_from_1)
	movzbl	(%rsi, %rdx), %r8d
#  ifdef USE_AS_STPCPY
	movl	%edx, %eax
	adcq	%rdi, %rax
	movb	%r8b, (%rdi, %rdx)
	ret
#  endif

L(copy_1):
#  ifdef USE_AS_STPCPY
	movl	%edx, %eax
	cmpl	%ecx, %edx
	adcq	%rdi, %rax
#  endif
#  ifdef USE_AS_WCSCPY
	vmovd	%xmm0, (%rdi)
#  else
	movb	%r8b, (%rdi, %rdx)
#  endif
	ret
# endif

	.p2align 4,, 2
L(zero_len):
	/* A length of zero just returns.  Otherwise we got here because
	   the length is absurdly large; restore it and take the
	   best-effort path at the end of the file.  */
	incq	%rdx
	jnz	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret
# ifndef USE_AS_WCSCPY
	.p2align 4,, 8
L(zfill_from_1):
#  ifdef USE_AS_STPCPY
	leaq	(%rdi, %rcx), %rax
#  endif
	movw	$0, -1(%rdi, %rdx)
	ret
# endif

	.p2align 4,, 4
	.p2align 6,, 8
L(page_cross):
	movq	%rsi, %rax
	andq	$(VEC_SIZE * -1), %rax

	VPCMPEQ	(%rax), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %ecx
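	/* shrx only uses the low 5 bits of the shift count, so this
	   shifts the mask right by rsi's offset within the aligned
	   vector, making bit 0 correspond to the byte at (%rsi).  */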
	shrxl	%esi, %ecx, %ecx

	subl	%esi, %eax
	andl	$(VEC_SIZE - 1), %eax
	cmpq	%rax, %rdx
	jb	L(page_cross_small)
	/* Optimizing more aggressively for space as this is very cold
	   code.  This saves 2x cache lines.  */

	/* The `shl` makes the `bsf` below yield a byte count that
	   includes the null terminator.  If the result is zero there is
	   no null before the end of the page, so continue on the normal
	   path.  */
	shl	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsf	%ecx, %ecx

	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
# else
	movq	%rdi, %rax
# endif

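	/* rcx counts the null terminator, so `rep movsb` copies the
	   string including its terminating null.  The explicit store
	   below zeroes the character at the updated rdi because the
	   shared zfill code only starts clearing at CHAR_SIZE(%rdi).  */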
	rep	movsb
# ifdef USE_AS_WCSCPY
	movl	$0, (%rdi)
# else
	movb	$0, (%rdi)
# endif
	jmp	L(zfill_from_page_cross)

L(page_cross_small):
	tzcntl	%ecx, %ecx
	xorl	%eax, %eax
	cmpl	%ecx, %edx
	jbe	L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */
	movq	%rdi, %r9
	movl	%ecx, %r8d

	subl	%ecx, %edx
	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
	movl	%edx, %ecx
	rep	stosb
	movq	%r9, %rdi
	movl	%r8d, %edx
L(page_cross_copy_only):
	leal	CHAR_SIZE(%rdx), %ecx
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdi, %rdx
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# else
	movq	%rdi, %rax
# endif
	rep	movsb
	ret


L(best_effort_strncpy):
	movq	%rdx, %rcx
	xorl	%eax, %eax
	movq	%rdi, %r8
	/* The length is >= 2^63 (or > 2^56 wide characters).  We fully
	   expect the rep stos to segfault.  If it somehow succeeds, just
	   strcpy to finish.  */
# ifdef USE_AS_WCSCPY
	rep	stosl
# else
	rep	stosb
# endif
	movq	%r8, %rdi
	jmp	OVERFLOW_STRCPY
END(STRNCPY)
#endif
