/* strcpy with AVX2
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRCPY
#  define STRCPY	__strcpy_avx2
# endif

	/* Use movsb in page cross case to save code size.  */
# define USE_MOVSB_IN_PAGE_CROSS	1

# ifdef USE_AS_WCSCPY
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# define PAGE_SIZE	4096

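	/* END_REG addresses the end of the copy: for stpcpy it is %rax
	   (set to dst + null index before use), otherwise it expands so
	   that (%END_REG) becomes (%rdi, %rdx), i.e. dst plus the offset
	   of the null terminator.  */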
# ifdef USE_AS_STPCPY
#  define END_REG	rax
# else
#  define END_REG	rdi, %rdx
# endif

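	/* Scratch register for the page-cross check below.  In the
	   strcat case %rax already holds the return value (the original
	   dst), so use %ecx instead.  */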
# ifdef USE_AS_STRCAT
#  define PAGE_ALIGN_REG	ecx
# else
#  define PAGE_ALIGN_REG	eax
# endif

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)

	.section SECTION(.text), "ax", @progbits
ENTRY(STRCPY)
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128

# ifdef USE_AS_STRCAT
	movq	%rdi, %rax
#  include "strcat-strlen-avx2.h.S"
# endif

	movl	%esi, %PAGE_ALIGN_REG
	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
	ja	L(page_cross)
L(page_cross_continue):
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
# endif
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	testl	%ecx, %ecx
	jz	L(more_1x_vec)

	/* No longer need ymm registers so just vzeroupper so it doesn't
	   need to be duplicated at each return statement.  */
	COND_VZEROUPPER

	xorl	%edx, %edx
	bsfl	%ecx, %edx
# ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
# endif

	/* Use mask bits in rcx to detect which copy we need. If the low
	   mask is zero then there must be a bit set in the upper half.
	   I.e if ecx != 0 and cx == 0, then match must be upper 16
	   bits so we use L(copy_16_31).  */
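	/* For example, if the first null is at byte index 20, bits 0-15
	   of ecx are clear (those bytes are non-zero), so cx == 0 and
	   the 16-31 byte copy path is taken.  */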
	testw	%cx, %cx
	jz	L(copy_16_31)

	testb	%cl, %cl
	jz	L(copy_8_15)
# ifdef USE_AS_WCSCPY
	vmovd	%xmm0, (%rdi)
	movl	$0, (%END_REG)
	ret
# else
	testb	$0x7, %cl
	jz	L(copy_4_7)

	testl	%edx, %edx
	jz	L(set_null_term)
	vmovd	%xmm0, %ecx
	movw	%cx, (%rdi)

	.p2align 4,, 2
L(set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 12
L(copy_4_7):
	movl	-3(%rsi, %rdx), %ecx
	vmovd	%xmm0, (%rdi)
	movl	%ecx, -3(%END_REG)
	ret
# endif

	.p2align 4,, 10
L(copy_16_31):
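	/* %rdx is the offset of the null terminator (16-31).  Copy the
	   first 16 bytes from %xmm0 and the 16 bytes that end with the
	   null character; the two stores may overlap.  */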
	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 10
L(copy_8_15):
# ifdef USE_AS_WCSCPY
	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
# else
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
# endif
	vmovq	%xmm0, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
	ret


	.p2align 4,, 8
L(more_1x_vec):
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rdi)
# endif
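	/* Round %rsi up so that 1(%rsi) is the next VEC_SIZE-aligned
	   address.  The first VEC_SIZE bytes were already copied from
	   %VMM(0); the aligned chunk may overlap them but never skips
	   past them.  %rdi is adjusted by the same amount so src and
	   dst stay in lockstep.  */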
	subq	%rsi, %rdi
	orq	$(VEC_SIZE - 1), %rsi
	addq	%rsi, %rdi
	VMOVA	1(%rsi), %VMM(1)

	/* Try and order stores after as many loads as is reasonable to
	   avoid potential false dependencies.  */
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rax)
# endif
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), 1(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)

	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %edx
	testl	%edx, %edx
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)

	/* Subtract rsi from rdi before aligning. Adding back rsi will
	   get proper rdi (dst) for new src.  */
	subq	%rsi, %rdi
	incq	%rsi
	orq	$(VEC_SIZE * 4 - 1), %rsi
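	/* After the incq/orq above, 1(%rsi) is (VEC_SIZE * 4)-aligned,
	   so the VMOVA loads below and in the loop are all aligned.
	   Stores go through VMOVU since dst has no alignment
	   guarantee.  */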

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

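	/* An unsigned per-character minimum is zero only where at least
	   one input is zero, so checking the min of all four vectors
	   against zero detects a null character in any of them with a
	   single compare and mask extraction.  */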
	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %edx
	addq	%rsi, %rdi

	testl	%edx, %edx
	jnz	L(loop_4x_done)

	.p2align 4,, 11
L(loop_4x_vec):
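	/* Software pipelined: store the four vectors checked on the
	   previous iteration, then load and check the next four.  */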

	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
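	/* Subtracting -(VEC_SIZE * 4) instead of adding it lets the
	   immediate fit in a sign-extended imm8 (for VEC_SIZE == 32,
	   adding +128 would need a 32-bit immediate).  */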
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)


	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %edx
	subq	$(VEC_SIZE * -4), %rdi
	testl	%edx, %edx
	jz	L(loop_4x_vec)

L(loop_4x_done):
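	/* %edx holds the combined mask from the VPMIN reduction, so at
	   least one of the four vectors contains a null.  Re-check each
	   vector individually, storing the null-free ones along the
	   way.  */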
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)

	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
L(ret_vec_x4):
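	/* The null is in the fourth vector of the current block; bsf of
	   %edx gives its byte offset within that vector.  Copy the
	   VEC_SIZE bytes that end with the null character; this store
	   may overlap data already written.  */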
	bsfl	%edx, %edx
	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
# endif
L(return_end):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x1):
	bsfl	%ecx, %ecx
	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	1(%rcx, %rdi), %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 8
L(ret_vec_x2):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x3):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 4
L(page_cross):
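	/* src is within VEC_SIZE bytes of the end of a page, so an
	   unaligned VEC_SIZE load might fault.  Load from the preceding
	   VEC_SIZE-aligned address (which cannot cross the page) and
	   shift the mask right by the misalignment so bit 0 corresponds
	   to the first byte of src.  */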
	movq	%rsi, %rcx
	andq	$(VEC_SIZE * -1), %rcx

	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	shrxl	%esi, %ecx, %ecx
# if USE_MOVSB_IN_PAGE_CROSS
	/* Optimizing more aggressively for space as this is very cold
	   code. This saves 2x cache lines.  */

	/* Shifting the mask left by CHAR_SIZE adds CHAR_SIZE to the
	   later bsf result, so bsf yields the copy length in bytes
	   including the null terminator.  NB: this can never zero out a
	   non-zero RCX: in the page cross case rsi cannot be aligned,
	   and rcx has already been right-shifted by the misalignment.  */
	shll	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsfl	%ecx, %ecx
#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif
	rep	movsb
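	/* rep movsb copies %rcx bytes and advances %rsi and %rdi,
	   leaving %rdi just past the copied null terminator, hence the
	   -CHAR_SIZE adjustment to form the stpcpy return value.  */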
#  ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi), %rax
#  endif

	VZEROUPPER_RETURN

# else
	testl	%ecx, %ecx
	jz	L(page_cross_continue)

	/* Traditional copy case, essentially same as used in non-page-
	   cross case but since we can't reuse VMM(0) we need twice as
	   many loads from rsi.  */
#  ifndef USE_AS_STRCAT
	xorl	%edx, %edx
#  endif
	bsfl	%ecx, %edx
#  ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
#  elif !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif

	/* vzeroupper early to avoid duplicating at each return.  */
	COND_VZEROUPPER

	testw	%cx, %cx
	jz	L(page_cross_copy_16_31)

	testb	%cl, %cl
	jz	L(page_cross_copy_8_15)

	testb	$0x7, %cl
	jz	L(page_cross_copy_4_7)

	testl	%edx, %edx
	jz	L(page_cross_set_null_term)
	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)
L(page_cross_set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_4_7):
	movl	(%rsi), %ecx
	movl	-3(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, -3(%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_8_15):
	movq	(%rsi), %rcx
	movq	-7(%rsi, %rdx), %rsi
	movq	%rcx, (%rdi)
	movq	%rsi, -7(%END_REG)
	ret


	.p2align 4,, 3
L(page_cross_copy_16_31):
	VMOVU	(%rsi), %xmm0
	VMOVU	-15(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -15(%END_REG)
	ret
# endif

END(STRCPY)
#endif

