/* strncat with AVX2
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT	__strncat_avx2
# endif

# ifdef USE_AS_WCSCPY
#  define MOVCHAR	movl
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define MOVCHAR	movb
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
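	/* Inputs (SysV ABI): %rdi = dst, %rsi = src, %rdx = maximum
	   number of characters to append from src.  The return value
	   (dst) is kept in %rax.  */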
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter zero length strings and very long strings.  Zero
	   length strings just return; very long strings are handled
	   by the non-length variant {wcs|str}cat.  */
	movq	%rdi, %rax
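	/* For wide characters the length must be scaled to bytes below;
	   (n - 1) >> 56 is non-zero exactly when n == 0 or n > 2^56, so
	   those cases are filtered out before `salq $2` could overflow.
	   For the byte version the signed `jle` catches both n == 0 and
	   huge values that appear negative.  */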
# ifdef USE_AS_WCSCPY
	leaq	-1(%rdx), %rcx
	shr	$56, %rcx
	jnz	L(zero_len)
	salq	$2, %rdx
# else
	test	%rdx, %rdx
	jle	L(zero_len)
# endif
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128

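	/* The included fragment computes the length of the existing
	   string in dst and advances %rdi to its null terminator, so
	   what follows is effectively a length-limited strcpy of src
	   to that position (the result is always null-terminated).  */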
# include "strcat-strlen-avx2.h.S"

	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %ecx
	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
	ja	L(page_cross)
L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	tzcnt	%ecx, %r8d
	cmpq	%r8, %rdx
	jbe	L(less_1x_vec)

	testl	%ecx, %ecx
	jz	L(more_1x_vec)

	/* Hoist this to save code size: when the null terminator comes
	   before the length limit, the copy length is its offset
	   (%r8d).  */

	movl	%r8d, %edx

L(less_1x_vec):
	COND_VZEROUPPER

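	/* %rdx is now the number of bytes to copy: the smaller of the
	   length limit and the offset of src's null terminator.  The
	   ranged cases below use a pair of possibly overlapping moves,
	   one from the start of src (already loaded in %VMM(0)) and one
	   ending at src + %rdx, then store the null terminator at
	   dst + %rdx.  */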
	cmpl	$16, %edx
	jae	L(copy_16_31)
	cmpl	$8, %edx
	jae	L(copy_8_15)


# ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
	MOVCHAR	$0, (%rdi, %rdx)
	ret
# else
	cmpl	$4, %edx
	jae	L(copy_4_7)

	movzbl	(%rsi), %ecx
	cmpl	$1, %edx
	jbe	L(set_null_term)

	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
	 */
	movzwl	1(%rsi), %esi
	movw	%si, 1(%rdi)

	.p2align 4,, 1
L(set_null_term):
	movb	%cl, (%rdi)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 11
L(copy_4_7):
	movl	-(4)(%rsi, %rdx), %ecx
	vmovd	%xmm0, (%rdi)
	movl	%ecx, -(4)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret
# endif


	.p2align 4,, 10
L(copy_16_31):
	VMOVU	-(16)(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -(16)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 10
L(copy_8_15):
	movq	-(8)(%rsi, %rdx), %rcx
	vmovq	%xmm0, (%rdi)
	movq	%rcx, -(8)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 8
	.p2align 6,, 14
L(more_1x_vec):
	VMOVU	%VMM(0), (%rdi)

	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
	addq	%rsi, %rdx
	subq	%rsi, %rdi
	orq	$(VEC_SIZE - 1), %rsi
	incq	%rsi
	addq	%rsi, %rdi
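	/* State from here on: %rsi is src rounded up to the next
	   VEC_SIZE boundary, %rdi has been advanced by the same amount
	   (the skipped bytes were already covered by the unaligned
	   store above), and %rdx points just past the last src byte the
	   length limit allows.  */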
L(loop_last_4x_vec):
	subq	%rsi, %rdx
	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):
	tzcnt	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len)

	cmpl	$VEC_SIZE, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (%rdi)
	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	addl	$-VEC_SIZE, %edx
	bzhil	%edx, %ecx, %r8d
	jz	L(ret_vec_x2_len)
L(ret_vec_x2):
	bsfl	%ecx, %edx
L(ret_vec_x2_len):
	VMOVU	(%rsi, %rdx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rdx)
	VMOVU	%VMM(0), (%rdi, %rdx)
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN


	.p2align 4,, 12
L(ret_vec_x1_len):
	movl	%edx, %ecx
L(ret_vec_x1):
	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
	MOVCHAR	$0, (%rdi, %rcx)
	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(last_4x_vec):
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	subq	$-(VEC_SIZE * 4), %rdi
	addl	$-(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
	/* L(ret_vec_x1) expects ecx to have position of first match so
	   test with bsf.  */
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)


	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)

	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	/* Check if length is greater than 4x VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	addl	$(VEC_SIZE * -2), %edx

	tzcnt	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len)

	cmpl	$VEC_SIZE, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	addl	$-VEC_SIZE, %edx
	bzhil	%edx, %ecx, %r8d
	jz	L(ret_vec_x4_len)
L(ret_vec_x4):
	bsfl	%ecx, %edx
L(ret_vec_x4_len):
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(ret_vec_x3_len):
	movl	%edx, %ecx
L(ret_vec_x3):
	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rcx)
	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
	VZEROUPPER_RETURN


	.p2align 4,, 8
L(more_4x_vec):
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)


	/* Recheck length before aligning.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	jbe	L(last_4x_vec)

	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
	addq	%rsi, %rdx
	subq	%rsi, %rdi
	subq	$-(VEC_SIZE * 4), %rsi
	andq	$(VEC_SIZE * -4), %rsi

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

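	/* The unsigned minimum of the four vectors is zero in a given
	   element iff at least one of them is zero there, so a single
	   compare of the combined minimum against zero detects a null
	   terminator anywhere in the 4x VEC block.  */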
	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %r8d
	addq	%rsi, %rdi
	testl	%r8d, %r8d
	jnz	L(loop_4x_done)

	/* Use r9 for end of region before handling last 4x VEC
	   specially.  */
	leaq	-(VEC_SIZE * 4)(%rdx), %r9
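	/* Each iteration stores the 4 VECs loaded and checked on the
	   previous one, then loads and checks the next 4.  It exits to
	   L(loop_last_4x_vec) once no more than 4x VEC remains before
	   the length limit, or falls through to L(loop_4x_done) when a
	   null terminator is found.  */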

	.p2align 4,, 11
L(loop_4x_vec):

	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)

	subq	$(VEC_SIZE * -4), %rdi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %r8d

	testl	%r8d, %r8d
	jz	L(loop_4x_vec)

L(loop_4x_done):
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	/* L(ret_vec_x1) expects ecx to have position of first match so
	   test with bsf.  */
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)

	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x3)

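	/* Zeros in VEC(0)..VEC(2) have been ruled out above, so every
	   set bit in %r8d comes from VEC(3).  Finish with an unaligned
	   copy whose last character is the null terminator; it may
	   overlap bytes that were already stored.  */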
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	bsfl	%r8d, %r8d
	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
	VZEROUPPER_RETURN



	.p2align 4,, 4
L(page_cross):
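	/* src is within VEC_SIZE bytes of a page boundary, so load the
	   aligned VEC containing it instead of a (possibly faulting)
	   unaligned VEC, and shift the zero mask right by the
	   misalignment so bit 0 corresponds to src[0].  %r8d becomes
	   the number of bytes from src to the next VEC boundary.  */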
	movq	%rsi, %r8
	andq	$(VEC_SIZE * -1), %r8

	VPCMPEQ	(%r8), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %ecx
	shrxl	%esi, %ecx, %ecx

	subl	%esi, %r8d
	andl	$(VEC_SIZE - 1), %r8d
	cmpq	%r8, %rdx
	jbe	L(page_cross_small)

	/* Optimizing more aggressively for space as this is very cold
	   code.  This saves 2x cache lines.  */

	/* The shift adds one character (CHAR_SIZE bytes) to the later
	   bsf result so the copy bound includes the null terminator.
	   NB: this can never zero out a non-zero RCX because in the
	   page cross case rsi cannot be aligned and rcx has already
	   been right-shifted by the misalignment.  */
	shll	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsfl	%ecx, %ecx
	rep	movsb
	VZEROUPPER_RETURN

L(page_cross_small):
	tzcntl	%ecx, %ecx
	jz	L(page_cross_setz)
	cmpl	%edx, %ecx
	cmova	%edx, %ecx
	rep	movsb
L(page_cross_setz):
	MOVCHAR	$0, (%rdi)
	VZEROUPPER_RETURN
L(zero_len):
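	/* Reached with a length of zero (just return dst) or with a
	   length so large that it is treated as unbounded; the latter
	   case branches to the non-length {wcs|str}cat variant via
	   OVERFLOW_STRCAT.  */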
# ifdef USE_AS_WCSCPY
	test	%rdx, %rdx
# endif
	jnz	OVERFLOW_STRCAT
	ret


END(STRNCAT)
#endif


/* Source: glibc/sysdeps/x86_64/multiarch/strncat-avx2.S.  */