/* strncat with AVX2
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRNCAT
#  define STRNCAT	__strncat_avx2
# endif

# ifdef USE_AS_WCSCPY
#  define MOVCHAR	movl
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define MOVCHAR	movb
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE	4096

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRNCAT)
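	/* Inputs (SysV ABI): %rdi = dst, %rsi = src, %rdx = maximum
	   number of characters to append from src.  The return value
	   (dst) is kept in %rax.  */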
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	/* Filter zero length strings and very long strings.  Zero
	   length strings just return; very long strings are handled
	   by the non-length variant {wcs|str}cat.  */
	movq	%rdi, %rax
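	/* For wide characters the length must be scaled to bytes below;
	   (n - 1) >> 56 is non-zero exactly when n == 0 or n > 2^56, so
	   those cases are filtered out before `salq $2` could overflow.
	   For the byte version the signed `jle` catches both n == 0 and
	   huge values that appear negative.  */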
# ifdef USE_AS_WCSCPY
	leaq	-1(%rdx), %rcx
	shr	$56, %rcx
	jnz	L(zero_len)
	salq	$2, %rdx
# else
	test	%rdx, %rdx
	jle	L(zero_len)
# endif
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128

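	/* The included fragment computes the length of the existing
	   string in dst and advances %rdi to its null terminator, so
	   what follows is effectively a length-limited strcpy of src
	   to that position (the result is always null-terminated).  */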
# include "strcat-strlen-avx2.h.S"

	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %ecx
	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
	ja	L(page_cross)
L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	tzcnt	%ecx, %r8d
	cmpq	%r8, %rdx
	jbe	L(less_1x_vec)

	testl	%ecx, %ecx
	jz	L(more_1x_vec)

	/* Hoist this to save code size: when the null terminator comes
	   before the length limit, the copy length is its offset
	   (%r8d).  */

	movl	%r8d, %edx

L(less_1x_vec):
	COND_VZEROUPPER

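	/* %rdx is now the number of bytes to copy: the smaller of the
	   length limit and the offset of src's null terminator.  The
	   ranged cases below use a pair of possibly overlapping moves,
	   one from the start of src (already loaded in %VMM(0)) and one
	   ending at src + %rdx, then store the null terminator at
	   dst + %rdx.  */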
	cmpl	$16, %edx
	jae	L(copy_16_31)
	cmpl	$8, %edx
	jae	L(copy_8_15)


# ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
	MOVCHAR	$0, (%rdi, %rdx)
	ret
# else
	cmpl	$4, %edx
	jae	L(copy_4_7)

	movzbl	(%rsi), %ecx
	cmpl	$1, %edx
	jbe	L(set_null_term)

	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
	 */
	movzwl	1(%rsi), %esi
	movw	%si, 1(%rdi)

	.p2align 4,, 1
L(set_null_term):
	movb	%cl, (%rdi)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 11
L(copy_4_7):
	movl	-(4)(%rsi, %rdx), %ecx
	vmovd	%xmm0, (%rdi)
	movl	%ecx, -(4)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret
# endif


	.p2align 4,, 10
L(copy_16_31):
	VMOVU	-(16)(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -(16)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 10
L(copy_8_15):
	movq	-(8)(%rsi, %rdx), %rcx
	vmovq	%xmm0, (%rdi)
	movq	%rcx, -(8)(%rdi, %rdx)
	MOVCHAR	$0, (%rdi, %rdx)
	ret

	.p2align 4,, 8
	.p2align 6,, 14
L(more_1x_vec):
	VMOVU	%VMM(0), (%rdi)

	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
	addq	%rsi, %rdx
	subq	%rsi, %rdi
	orq	$(VEC_SIZE - 1), %rsi
	incq	%rsi
	addq	%rsi, %rdi
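	/* State from here on: %rsi is src rounded up to the next
	   VEC_SIZE boundary, %rdi has been advanced by the same amount
	   (the skipped bytes were already covered by the unaligned
	   store above), and %rdx points just past the last src byte the
	   length limit allows.  */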
L(loop_last_4x_vec):
	subq	%rsi, %rdx
	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):
	tzcnt	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x1_len)

	cmpl	$VEC_SIZE, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (%rdi)
	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	addl	$-VEC_SIZE, %edx
	bzhil	%edx, %ecx, %r8d
	jz	L(ret_vec_x2_len)
L(ret_vec_x2):
	bsfl	%ecx, %edx
L(ret_vec_x2_len):
	VMOVU	(%rsi, %rdx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rdx)
	VMOVU	%VMM(0), (%rdi, %rdx)
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN


	.p2align 4,, 12
L(ret_vec_x1_len):
	movl	%edx, %ecx
L(ret_vec_x1):
	VMOVU	-(VEC_SIZE)(%rsi, %rcx), %VMM(1)
	MOVCHAR	$0, (%rdi, %rcx)
	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rcx)
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(last_4x_vec):
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	0(%rsi), %VMM(1)
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	subq	$-(VEC_SIZE * 4), %rdi
	addl	$-(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)
	.p2align 4,, 8
L(more_2x_vec):
	/* L(ret_vec_x1) expects ecx to have position of first match so
	   test with bsf.  */
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)


	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE * 1)(%rdi)

	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	/* Check if length is greater than 4x VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	addl	$(VEC_SIZE * -2), %edx

	tzcnt	%ecx, %ecx
	cmpl	%ecx, %edx
	jbe	L(ret_vec_x3_len)

	cmpl	$VEC_SIZE, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	addl	$-VEC_SIZE, %edx
	bzhil	%edx, %ecx, %r8d
	jz	L(ret_vec_x4_len)
L(ret_vec_x4):
	bsfl	%ecx, %edx
L(ret_vec_x4_len):
	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(ret_vec_x3_len):
	movl	%edx, %ecx
L(ret_vec_x3):
	VMOVU	(VEC_SIZE)(%rsi, %rcx), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rcx)
	VMOVU	%VMM(0), (VEC_SIZE)(%rdi, %rcx)
	VZEROUPPER_RETURN


	.p2align 4,, 8
L(more_4x_vec):
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi)


	/* Recheck length before aligning.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	jbe	L(last_4x_vec)

	/* Align rsi (src) and adjust rdx/rdi (length/dst).  */
	addq	%rsi, %rdx
	subq	%rsi, %rdi
	subq	$-(VEC_SIZE * 4), %rsi
	andq	$(VEC_SIZE * -4), %rsi

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

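	/* The unsigned minimum of the four vectors is zero in a given
	   element iff at least one of them is zero there, so a single
	   compare of the combined minimum against zero detects a null
	   terminator anywhere in the 4x VEC block.  */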
	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %r8d
	addq	%rsi, %rdi
	testl	%r8d, %r8d
	jnz	L(loop_4x_done)

	/* Use r9 for end of region before handling last 4x VEC
	   specially.  */
	leaq	-(VEC_SIZE * 4)(%rdx), %r9
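	/* Each iteration stores the 4 VECs loaded and checked on the
	   previous one, then loads and checks the next 4.  It exits to
	   L(loop_last_4x_vec) once no more than 4x VEC remains before
	   the length limit, or falls through to L(loop_4x_done) when a
	   null terminator is found.  */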

	.p2align 4,, 11
L(loop_4x_vec):

	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)

	subq	$(VEC_SIZE * -4), %rdi
	cmpq	%rsi, %r9
	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %r8d

	testl	%r8d, %r8d
	jz	L(loop_4x_vec)

L(loop_4x_done):
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	/* L(ret_vec_x1) expects ecx to have position of first match so
	   test with bsf.  */
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)

	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	bsfl	%ecx, %ecx
	jnz	L(ret_vec_x3)

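	/* Zeros in VEC(0)..VEC(2) have been ruled out above, so every
	   set bit in %r8d comes from VEC(3).  Finish with an unaligned
	   copy whose last character is the null terminator; it may
	   overlap bytes that were already stored.  */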
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	bsfl	%r8d, %r8d
	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
	VZEROUPPER_RETURN



	.p2align 4,, 4
L(page_cross):
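	/* src is within VEC_SIZE bytes of a page boundary, so load the
	   aligned VEC containing it instead of a (possibly faulting)
	   unaligned VEC, and shift the zero mask right by the
	   misalignment so bit 0 corresponds to src[0].  %r8d becomes
	   the number of bytes from src to the next VEC boundary.  */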
	movq	%rsi, %r8
	andq	$(VEC_SIZE * -1), %r8

	VPCMPEQ	(%r8), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %ecx
	shrxl	%esi, %ecx, %ecx

	subl	%esi, %r8d
	andl	$(VEC_SIZE - 1), %r8d
	cmpq	%r8, %rdx
	jbe	L(page_cross_small)

	/* Optimizing more aggressively for space as this is very cold
	   code.  This saves 2x cache lines.  */

	/* The shift adds one character (CHAR_SIZE bytes) to the later
	   bsf result so the copy bound includes the null terminator.
	   NB: this can never zero out a non-zero RCX because in the
	   page cross case rsi cannot be aligned and rcx has already
	   been right-shifted by the misalignment.  */
	shll	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsfl	%ecx, %ecx
	rep	movsb
	VZEROUPPER_RETURN

L(page_cross_small):
	tzcntl	%ecx, %ecx
	jz	L(page_cross_setz)
	cmpl	%edx, %ecx
	cmova	%edx, %ecx
	rep	movsb
L(page_cross_setz):
	MOVCHAR	$0, (%rdi)
	VZEROUPPER_RETURN
L(zero_len):
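	/* Reached with a length of zero (just return dst) or with a
	   length so large that it is treated as unbounded; the latter
	   case branches to the non-length {wcs|str}cat variant via
	   OVERFLOW_STRCAT.  */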
# ifdef USE_AS_WCSCPY
	test	%rdx, %rdx
# endif
	jnz	OVERFLOW_STRCAT
	ret


END(STRNCAT)
#endif


/* Source: glibc/sysdeps/x86_64/multiarch/strncat-avx2.S.  */