/* strncpy with AVX2
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>


# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRNCPY
#  define STRNCPY	__strncpy_avx2
# endif


# ifdef USE_AS_WCSCPY
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)


	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCPY)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter zero-length strings and very long strings.  Zero-length
	   strings just return.  Very long strings are handled by running
	   rep stos{b|l} to zero-fill the destination (which will almost
	   certainly segfault); if that somehow succeeds, OVERFLOW_STRCPY
	   (strcpy, stpcpy, wcscpy, wcpcpy) finishes the copy.  */
# ifdef USE_AS_WCSCPY
	decq	%rdx
	movq	%rdx, %rax
	/* 56 is end of max supported address space.  */
	shr	$56, %rax
	jnz	L(zero_len)
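	/* Convert the character count to a byte count (CHAR_SIZE == 4
	   for wide characters).  */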
	salq	$2, %rdx
# else
	decq	%rdx
	/* `dec` can macro-fuse with `jl`.  If the branch ever needs to
	   become `jb`, replace `dec` with `sub` (`dec` does not set the
	   carry flag).  */
	jl	L(zero_len)
# endif

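	/* VZERO is the all-zeros vector compared against the source data
	   to find the null terminator.  Zeroing the 128-bit half also
	   clears the upper bits of the full register.  */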
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
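	/* If the VEC_SIZE-byte load below would cross a page boundary
	   (source offset within the page is greater than
	   PAGE_SIZE - VEC_SIZE), take the slow page-cross path.  */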
	movl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	/* If not STPCPY the return value is simply the original
	   destination, so save it ahead of time.  */
# ifndef USE_AS_STPCPY
	movq	%rdi, %rax
# elif defined USE_AS_WCSCPY
	/* Break the dependency on %rax, as nearly all of the wcpncpy
	   return paths build the return value with `setc %al`.  */
	xorl	%eax, %eax
# endif

	cmpq	$(VEC_SIZE - CHAR_SIZE), %rdx
	/* `jbe` because length rdx is now length - CHAR_SIZE.  The
	   flags from this cmp are reused at L(less_1x_vec).  */
	jbe	L(less_1x_vec)

	/* This store may write more than the string needs, but that is
	   fine because we still have to zero-fill up to the length
	   anyway.  */
	VMOVU	%VMM(0), (%rdi)

	testl	%ecx, %ecx
	jnz	L(zfill)

	/* Align.  */
	addq	%rsi, %rdx
	subq	%rsi, %rdi
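	/* Round rsi up to the next VEC_SIZE boundary (this always moves
	   past the vector that was just copied).  rdi now holds
	   dst - src, so adding the new rsi back recovers the matching
	   destination pointer.  */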
	orq	$(VEC_SIZE - 1), %rsi
	incq	%rsi
L(last_4x_vec):
	addq	%rsi, %rdi
L(loop_last_4x_vec):
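	/* rdx was set to src + (length - CHAR_SIZE), so after this
	   subtraction it holds the remaining length (minus CHAR_SIZE)
	   measured from the current source position.  */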
	subq	%rsi, %rdx


	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	cmpq	$(VEC_SIZE * 2), %rdx
	jae	L(more_2x_vec)

	cmpl	$(VEC_SIZE), %edx
	jb	L(ret_vec_x1_len)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VPCMPEQ	VEC_SIZE(%rsi), %VZERO, %VMM(6)
	VMOVU	%VMM(1), (%rdi)
	vpmovmskb %VMM(6), %ecx
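	/* Shift the second vector's mask into bits [VEC_SIZE, 2 * VEC_SIZE)
	   so a single tzcntq yields the byte offset of the first null
	   from %rsi (or 2 * VEC_SIZE if there is none).  */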
	shlq	$VEC_SIZE, %rcx
L(ret_vec_x1_len):
	tzcntq	%rcx, %rcx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len_no_zfill)
	/* The expected fall-through case is string length < buffer
	   length.  */
	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
L(ret_vec_x1_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* Clear the carry flag for the `adc`/`setc` used to build the
	   return value below.  */
	xorl	%ecx, %ecx
# endif
L(ret_vec_x1_len_no_zfill):
	VMOVU	((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 6
L(ret_vec_x1):
	bsfl	%ecx, %ecx
	VMOVU	%VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
	subl	%ecx, %edx
	/* Check if we need to reload/store.  */
	cmpl	$VEC_SIZE, %edx
	jb	L(ret_vec_x1_len_no_zfill_mov)
	/* Otherwise safe to just store directly.  */
	VMOVU	%VMM(1), (%rdi)
	VMOVU	%VZERO, (%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(%rdi, %rcx), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 12
L(more_2x_vec):
	VMOVU	%VMM(1), (%rdi)
	testl	%ecx, %ecx
	/* Must fill at least 2x VEC.  */
	jnz	L(zfill_vec1)

	VMOVA	VEC_SIZE(%rsi), %VMM(2)
	VMOVU	%VMM(2), VEC_SIZE(%rdi)
	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	/* Must fill at least 1x VEC.  */
	jnz	L(zfill_vec2)

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	/* Check if len is more than 4x VEC.  -CHAR_SIZE because rdx is
	   len - CHAR_SIZE.  */
	cmpq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
	ja	L(more_4x_vec)

	subl	$(VEC_SIZE * 3), %edx
	jb	L(ret_vec_x3_len)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	vpmovmskb %VMM(6), %ecx
	tzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x4_len_no_zfill)
	/* The expected fall-through case is string length < buffer
	   length.  */
	VMOVU	%VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
	movl	%ecx, %edx
L(ret_vec_x4_len_no_zfill):
	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 3 + 0)(%edx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	VZEROUPPER_RETURN


L(ret_vec_x3_len):
	addl	$(VEC_SIZE * 1), %edx
	tzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len_no_zfill)
	/* The expected fall-through case is string length < buffer
	   length.  */
	VMOVU	%VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
L(ret_vec_x3_len_no_zfill_mov):
	movl	%ecx, %edx
# ifdef USE_AS_STPCPY
	/* Clear the carry flag for the `adc`/`setc` used to build the
	   return value below.  */
	xorl	%ecx, %ecx
# endif
	.p2align 4,, 4
L(ret_vec_x3_len_no_zfill):
	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
#  else
	leal	(VEC_SIZE * 2 + 0)(%rdx), %eax
	adcq	%rdi, %rax
#  endif
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 8
L(ret_vec_x3):
	bsfl	%ecx, %ecx
	VMOVU	%VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
	subl	%ecx, %edx
	jl	L(ret_vec_x3_len_no_zfill_mov)
	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 2)(%rdi, %rcx), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(more_4x_vec):

	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	testl	%ecx, %ecx
	jnz	L(zfill_vec3)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec4)

	movq	%rdx, %rcx
	addq	%rsi, %rdx
	subq	%rsi, %rdi
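	/* subq of a negative immediate is used instead of addq because
	   -(VEC_SIZE * 4) fits in a sign-extended 8-bit immediate while
	   +(VEC_SIZE * 4) does not.  */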
	subq	$-(VEC_SIZE * 4), %rsi
	/* Recheck length before aligning.  */
	cmpq	$(VEC_SIZE * 8 - CHAR_SIZE), %rcx
	jbe	L(last_4x_vec)

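	/* Align rsi down to a 4 * VEC_SIZE boundary so the loop below
	   can use aligned loads.  */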
	andq	$(VEC_SIZE * -4), %rsi

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

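	/* The unsigned minimum of the four vectors has a zero element
	   iff at least one of them does, so a single compare + movemask
	   detects a null anywhere in the 4x VEC block.  */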
	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %r8d
	addq	%rsi, %rdi
	testl	%r8d, %r8d
	jnz	L(loop_4x_done)

	/* Use r9 as end register.  */
	leaq	-(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9

	.p2align 4,, 11
L(loop_4x_vec):

	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)

	subq	$(VEC_SIZE * -4), %rdi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %r8d

	testl	%r8d, %r8d
	jz	L(loop_4x_vec)

L(loop_4x_done):
	subq	%rsi, %rdx
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec1)

	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec2)

	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(zfill_vec3)

	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
	movl	%r8d, %ecx

	/* Zero-fill paths below.  */

	.p2align 4,, 4
L(zfill_vec4):
	addq	$(VEC_SIZE * 2), %rdi
	subq	$(VEC_SIZE * 2), %rdx
L(zfill_vec2):
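	/* The null was found in the second of a pair of vectors; shift
	   the mask up by VEC_SIZE so the bsf below yields its offset
	   from the first vector's base.  */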
	shlq	$VEC_SIZE, %rcx
L(zfill):
	bsfq	%rcx, %rcx
	subq	%rcx, %rdx
	addq	%rcx, %rdi
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_from_page_cross):
	cmpq	$VEC_SIZE, %rdx
	jb	L(zfill_less_vec_vzeroupper)

L(zfill_more_1x_vec):
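	/* Zero-fill with two potentially overlapping stores: one VEC
	   just past the null terminator and one VEC ending at the end
	   of the buffer.  */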
	VMOVU	%VZERO, CHAR_SIZE(%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
	cmpq	$(VEC_SIZE * 2), %rdx
	jae	L(zfill_more_2x_vec)
L(zfill_done0):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(zfill_vec3):
	addq	$(VEC_SIZE * 2), %rdi
	subq	$(VEC_SIZE * 2), %rdx
	.p2align 4,, 2
L(zfill_vec1):
	bsfl	%ecx, %ecx
	addq	%rcx, %rdi
	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
	/* A zfill entered from vec1/vec3 always has to set at least 2x
	   VECs.  */

	VMOVU	%VZERO, CHAR_SIZE(%rdi)
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
	cmpq	$(VEC_SIZE * 2), %rdx
	jb	L(zfill_done0)
L(zfill_more_2x_vec):
	VMOVU	%VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
	subq	$(VEC_SIZE * 4 - CHAR_SIZE), %rdx
	jbe	L(zfill_done)

	addq	%rdi, %rdx
	VMOVU	%VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
	VMOVU	%VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)


	VMOVU	%VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU	%VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq	$-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
	cmpq	%rdi, %rdx
	jbe	L(zfill_done)

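	/* Align rdi down so the loop below can use aligned stores; the
	   bytes before the aligned point were already zeroed above, so
	   the overlap is harmless.  */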
	andq	$-(VEC_SIZE), %rdi
	.p2align 4,, 12
L(zfill_loop_4x_vec):
	VMOVA	%VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%VZERO, (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(zfill_loop_4x_vec)
L(zfill_done):
	VZEROUPPER_RETURN


	.p2align 4,, 8
L(copy_1x):
	VMOVU	%VMM(0), (%rdi)
	testl	%ecx, %ecx
	jz	L(ret_32_32)
L(zfill_less_vec):
	bsfl	%ecx, %ecx
L(zfill_less_vec_no_bsf):
	subq	%rcx, %rdx
	addq	%rcx, %rdi
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
L(zfill_less_vec_vzeroupper):
	COND_VZEROUPPER
	/* We are taking advantage of the fact that to get here we must
	   have written the null terminator at (%rdi, %rcx), so we have
	   a byte of leeway for overwriting.  */
	cmpl	$16, %edx
	jb	L(zfill_less_16)
	VMOVU	%VZERO_128, (%rdi)
	VMOVU	%VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
	ret
# ifdef USE_AS_STPCPY
L(ret_32_32):
	leaq	CHAR_SIZE(%rdi, %rdx), %rax
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 4
L(copy_16_31):
	/* Overfill to avoid branches.  */
	vmovdqu	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
	cmpl	%ecx, %edx
	ja	L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY
L(ret_32_32):
# else
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdx, %rdi
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(copy_8_15):
	/* Overfill to avoid branches.  */
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
	vmovq	%xmm0, (%rdi)
	movq	%rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
	cmpl	%ecx, %edx
	jbe	L(ret_8_15)
	subq	%rcx, %rdx
	addq	%rcx, %rdi
# ifdef USE_AS_STPCPY
	movq	%rdi, %rax
# endif
	.p2align 4,, 8
L(zfill_less_16):
	xorl	%ecx, %ecx
	cmpl	$8, %edx
	jb	L(zfill_less_8)
	movq	%rcx, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
# ifndef USE_AS_STPCPY
L(ret_8_15):
# endif
	ret


	.p2align 4,, 8
L(less_1x_vec):
	/* Reuse the flags from the `cmp $(VEC_SIZE - CHAR_SIZE), %rdx`
	   above (equal when length == VEC_SIZE).  The idea is that many
	   buffer sizes are conventionally aligned.  */
	je	L(copy_1x)

	tzcntl	%ecx, %ecx
	cmpl	$16, %edx
	jae	L(copy_16_31)

	COND_VZEROUPPER
	cmpl	$8, %edx
	jae	L(copy_8_15)
# ifdef USE_AS_WCSCPY
	testl	%ecx, %ecx
	jz	L(zfill_less_8_set_ret)

	movl	(%rsi, %rdx), %esi
	vmovd	%xmm0, (%rdi)
	movl	%esi, (%rdi, %rdx)

#  ifdef USE_AS_STPCPY
	cmpl	%ecx, %edx
L(ret_8_15):
	setc	%al
	addq	%rdx, %rdi
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
#  endif
	ret
L(zfill_less_8_set_ret):
	xorl	%ecx, %ecx
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif
L(zfill_less_8):
	movl	%ecx, (%rdi)
	movl	%ecx, (%rdi, %rdx)
	ret

# else
	cmpl	$3, %edx
	jb	L(copy_0_3)
	/* Overfill to avoid branches.  */
	movl	-3(%rsi, %rdx), %esi
	vmovd	%xmm0, (%rdi)
	movl	%esi, -3(%rdi, %rdx)
	cmpl	%ecx, %edx
	jbe	L(ret_4_7)
	subq	%rcx, %rdx
	addq	%rcx, %rdi
#  ifdef USE_AS_STPCPY
	movq	%rdi, %rax
#  endif
	xorl	%ecx, %ecx
	.p2align 4,, 8
L(zfill_less_8):
	cmpl	$3, %edx
	jb	L(zfill_less_3)
	movl	%ecx, (%rdi)
	movl	%ecx, -3(%rdi, %rdx)
#  ifdef USE_AS_STPCPY
	ret
#  endif

L(ret_4_7):
#  ifdef USE_AS_STPCPY
L(ret_8_15):
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
	ret

	.p2align 4,, 4
L(zfill_less_3):
	testl	%edx, %edx
	jz	L(zfill_1)
	movw	%cx, (%rdi)
L(zfill_1):
	movb	%cl, (%rdi, %rdx)
	ret

	.p2align 4,, 8
L(copy_0_3):
	vmovd	%xmm0, %r8d
	testl	%edx, %edx
	jz	L(copy_1)
	movw	%r8w, (%rdi)
	cmpl	%ecx, %edx
	ja	L(zfill_from_1)
	movzbl	(%rsi, %rdx), %r8d
#  ifdef USE_AS_STPCPY
	movl	%edx, %eax
	adcq	%rdi, %rax
	movb	%r8b, (%rdi, %rdx)
	ret
#  endif

L(copy_1):
#  ifdef USE_AS_STPCPY
	movl	%edx, %eax
	cmpl	%ecx, %edx
	adcq	%rdi, %rax
#  endif
#  ifdef USE_AS_WCSCPY
	vmovd	%xmm0, (%rdi)
#  else
	movb	%r8b, (%rdi, %rdx)
#  endif
	ret
# endif

	.p2align 4,, 2
L(zero_len):
	/* A length of zero just returns.  Otherwise we got here because
	   the length is absurdly large; restore it and take the
	   best-effort path at the end of the file.  */
	incq	%rdx
	jnz	L(best_effort_strncpy)
	movq	%rdi, %rax
	ret
# ifndef USE_AS_WCSCPY
	.p2align 4,, 8
L(zfill_from_1):
#  ifdef USE_AS_STPCPY
	leaq	(%rdi, %rcx), %rax
#  endif
	movw	$0, -1(%rdi, %rdx)
	ret
# endif

	.p2align 4,, 4
	.p2align 6,, 8
L(page_cross):
	movq	%rsi, %rax
	andq	$(VEC_SIZE * -1), %rax

	VPCMPEQ	(%rax), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %ecx
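	/* shrx only uses the low 5 bits of the shift count, so this
	   shifts the mask right by rsi's offset within the aligned
	   vector, making bit 0 correspond to the byte at (%rsi).  */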
	shrxl	%esi, %ecx, %ecx

	subl	%esi, %eax
	andl	$(VEC_SIZE - 1), %eax
	cmpq	%rax, %rdx
	jb	L(page_cross_small)
	/* Optimizing more aggressively for space as this is very cold
	   code.  This saves 2x cache lines.  */

	/* The `shl` makes the `bsf` below yield a byte count that
	   includes the null terminator.  If the result is zero there is
	   no null before the end of the page, so continue on the normal
	   path.  */
	shl	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsf	%ecx, %ecx

	subq	%rcx, %rdx
# ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi, %rcx), %rax
# else
	movq	%rdi, %rax
# endif

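	/* rcx counts the null terminator, so `rep movsb` copies the
	   string including its terminating null.  The explicit store
	   below zeroes the character at the updated rdi because the
	   shared zfill code only starts clearing at CHAR_SIZE(%rdi).  */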
	rep	movsb
# ifdef USE_AS_WCSCPY
	movl	$0, (%rdi)
# else
	movb	$0, (%rdi)
# endif
	jmp	L(zfill_from_page_cross)

L(page_cross_small):
	tzcntl	%ecx, %ecx
	xorl	%eax, %eax
	cmpl	%ecx, %edx
	jbe	L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */
	movq	%rdi, %r9
	movl	%ecx, %r8d

	subl	%ecx, %edx
	leaq	CHAR_SIZE(%rdi, %rcx), %rdi
	movl	%edx, %ecx
	rep	stosb
	movq	%r9, %rdi
	movl	%r8d, %edx
L(page_cross_copy_only):
	leal	CHAR_SIZE(%rdx), %ecx
# ifdef USE_AS_STPCPY
#  ifdef USE_AS_WCSCPY
	setc	%al
	addq	%rdi, %rdx
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
#  else
	movl	%edx, %eax
	adcq	%rdi, %rax
#  endif
# else
	movq	%rdi, %rax
# endif
	rep	movsb
	ret


L(best_effort_strncpy):
	movq	%rdx, %rcx
	xorl	%eax, %eax
	movq	%rdi, %r8
	/* The length is >= 2^63 (or > 2^56 wide characters).  We fully
	   expect the rep stos to segfault.  If it somehow succeeds, just
	   strcpy to finish.  */
# ifdef USE_AS_WCSCPY
	rep	stosl
# else
	rep	stosb
# endif
	movq	%r8, %rdi
	jmp	OVERFLOW_STRCPY
END(STRNCPY)
#endif
