/* strcpy with AVX2
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRCPY
#  define STRCPY	__strcpy_avx2
# endif

	/* Use movsb in page cross case to save code size.  */
# define USE_MOVSB_IN_PAGE_CROSS	1

# ifdef USE_AS_WCSCPY
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# define PAGE_SIZE	4096

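	/* END_REG addresses the end of the copy: for stpcpy it is %rax
	   (set to dst + null index before use), otherwise it expands so
	   that (%END_REG) becomes (%rdi, %rdx), i.e. dst plus the offset
	   of the null terminator.  */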
# ifdef USE_AS_STPCPY
#  define END_REG	rax
# else
#  define END_REG	rdi, %rdx
# endif

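	/* Scratch register for the page-cross check below.  In the
	   strcat case %rax already holds the return value (the original
	   dst), so use %ecx instead.  */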
# ifdef USE_AS_STRCAT
#  define PAGE_ALIGN_REG	ecx
# else
#  define PAGE_ALIGN_REG	eax
# endif

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRCPY)
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128

# ifdef USE_AS_STRCAT
	movq	%rdi, %rax
#  include "strcat-strlen-avx2.h.S"
# endif

	movl	%esi, %PAGE_ALIGN_REG
	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
	ja	L(page_cross)
L(page_cross_continue):
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
# endif
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	testl	%ecx, %ecx
	jz	L(more_1x_vec)

	/* No longer need ymm registers so just vzeroupper so it doesn't
	   need to be duplicated at each return statement.  */
	COND_VZEROUPPER

	xorl	%edx, %edx
	bsfl	%ecx, %edx
# ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
# endif

	/* Use mask bits in rcx to detect which copy we need. If the low
	   mask is zero then there must be a bit set in the upper half.
	   I.e if ecx != 0 and cx == 0, then match must be upper 16
	   bits so we use L(copy_16_31).  */
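	/* For example, if the first null is at byte index 20, bits 0-15
	   of ecx are clear (those bytes are non-zero), so cx == 0 and
	   the 16-31 byte copy path is taken.  */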
	testw	%cx, %cx
	jz	L(copy_16_31)

	testb	%cl, %cl
	jz	L(copy_8_15)
# ifdef USE_AS_WCSCPY
	vmovd	%xmm0, (%rdi)
	movl	$0, (%END_REG)
	ret
# else
	testb	$0x7, %cl
	jz	L(copy_4_7)

	testl	%edx, %edx
	jz	L(set_null_term)
	vmovd	%xmm0, %ecx
	movw	%cx, (%rdi)

	.p2align 4,, 2
L(set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 12
L(copy_4_7):
	movl	-3(%rsi, %rdx), %ecx
	vmovd	%xmm0, (%rdi)
	movl	%ecx, -3(%END_REG)
	ret
# endif

	.p2align 4,, 10
L(copy_16_31):
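	/* %rdx is the offset of the null terminator (16-31).  Copy the
	   first 16 bytes from %xmm0 and the 16 bytes that end with the
	   null character; the two stores may overlap.  */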
	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 10
L(copy_8_15):
# ifdef USE_AS_WCSCPY
	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
# else
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
# endif
	vmovq	%xmm0, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
	ret


	.p2align 4,, 8
L(more_1x_vec):
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rdi)
# endif
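	/* Round %rsi up so that 1(%rsi) is the next VEC_SIZE-aligned
	   address.  The first VEC_SIZE bytes were already copied from
	   %VMM(0); the aligned chunk may overlap them but never skips
	   past them.  %rdi is adjusted by the same amount so src and
	   dst stay in lockstep.  */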
	subq	%rsi, %rdi
	orq	$(VEC_SIZE - 1), %rsi
	addq	%rsi, %rdi
	VMOVA	1(%rsi), %VMM(1)

	/* Try and order stores after as many loads as is reasonable to
	   avoid potential false dependencies.  */
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rax)
# endif
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), 1(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)

	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %edx
	testl	%edx, %edx
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)

	/* Subtract rsi from rdi before aligning. Adding back rsi will
	   get proper rdi (dst) for new src.  */
	subq	%rsi, %rdi
	incq	%rsi
	orq	$(VEC_SIZE * 4 - 1), %rsi
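	/* After the incq/orq above, 1(%rsi) is (VEC_SIZE * 4)-aligned,
	   so the VMOVA loads below and in the loop are all aligned.
	   Stores go through VMOVU since dst has no alignment
	   guarantee.  */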

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

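	/* An unsigned per-character minimum is zero only where at least
	   one input is zero, so checking the min of all four vectors
	   against zero detects a null character in any of them with a
	   single compare and mask extraction.  */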
	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %edx
	addq	%rsi, %rdi

	testl	%edx, %edx
	jnz	L(loop_4x_done)

	.p2align 4,, 11
L(loop_4x_vec):
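	/* Software pipelined: store the four vectors checked on the
	   previous iteration, then load and check the next four.  */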

	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
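	/* Subtracting -(VEC_SIZE * 4) instead of adding it lets the
	   immediate fit in a sign-extended imm8 (for VEC_SIZE == 32,
	   adding +128 would need a 32-bit immediate).  */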
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)


	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %edx
	subq	$(VEC_SIZE * -4), %rdi
	testl	%edx, %edx
	jz	L(loop_4x_vec)

L(loop_4x_done):
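	/* %edx holds the combined mask from the VPMIN reduction, so at
	   least one of the four vectors contains a null.  Re-check each
	   vector individually, storing the null-free ones along the
	   way.  */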
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)

	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
L(ret_vec_x4):
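	/* The null is in the fourth vector of the current block; bsf of
	   %edx gives its byte offset within that vector.  Copy the
	   VEC_SIZE bytes that end with the null character; this store
	   may overlap data already written.  */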
	bsfl	%edx, %edx
	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
# endif
L(return_end):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x1):
	bsfl	%ecx, %ecx
	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	1(%rcx, %rdi), %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 8
L(ret_vec_x2):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x3):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 4
L(page_cross):
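	/* src is within VEC_SIZE bytes of the end of a page, so an
	   unaligned VEC_SIZE load might fault.  Load from the preceding
	   VEC_SIZE-aligned address (which cannot cross the page) and
	   shift the mask right by the misalignment so bit 0 corresponds
	   to the first byte of src.  */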
	movq	%rsi, %rcx
	andq	$(VEC_SIZE * -1), %rcx

	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	shrxl	%esi, %ecx, %ecx
# if USE_MOVSB_IN_PAGE_CROSS
	/* Optimizing more aggressively for space as this is very cold
	   code. This saves 2x cache lines.  */

	/* Shifting the mask left by CHAR_SIZE adds CHAR_SIZE to the
	   later bsf result, so bsf yields the copy length in bytes
	   including the null terminator.  NB: this can never zero out a
	   non-zero RCX: in the page cross case rsi cannot be aligned,
	   and rcx has already been right-shifted by the misalignment.  */
	shll	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsfl	%ecx, %ecx
#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif
	rep	movsb
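	/* rep movsb copies %rcx bytes and advances %rsi and %rdi,
	   leaving %rdi just past the copied null terminator, hence the
	   -CHAR_SIZE adjustment to form the stpcpy return value.  */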
#  ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi), %rax
#  endif

	VZEROUPPER_RETURN

# else
	testl	%ecx, %ecx
	jz	L(page_cross_continue)

	/* Traditional copy case, essentially same as used in non-page-
	   cross case but since we can't reuse VMM(0) we need twice as
	   many loads from rsi.  */
#  ifndef USE_AS_STRCAT
	xorl	%edx, %edx
#  endif
	bsfl	%ecx, %edx
#  ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
#  elif !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif

	/* vzeroupper early to avoid duplicating at each return.  */
	COND_VZEROUPPER

	testw	%cx, %cx
	jz	L(page_cross_copy_16_31)

	testb	%cl, %cl
	jz	L(page_cross_copy_8_15)

	testb	$0x7, %cl
	jz	L(page_cross_copy_4_7)

	testl	%edx, %edx
	jz	L(page_cross_set_null_term)
	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)
L(page_cross_set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_4_7):
	movl	(%rsi), %ecx
	movl	-3(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, -3(%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_8_15):
	movq	(%rsi), %rcx
	movq	-7(%rsi, %rdx), %rsi
	movq	%rcx, (%rdi)
	movq	%rsi, -7(%END_REG)
	ret


	.p2align 4,, 3
L(page_cross_copy_16_31):
	VMOVU	(%rsi), %xmm0
	VMOVU	-15(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -15(%END_REG)
	ret
# endif

END(STRCPY)
#endif

