1/* strcat with SSE2
2 Copyright (C) 2011-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21/* Build this file when MINIMUM_X86_ISA_LEVEL <= 2: there is no dedicated
22 V2 implementation, so this one must also be built for ISA V2 builds. */
23#if ISA_SHOULD_BUILD (2)
24
25
26# include <sysdep.h>
27
/* Name of the symbol defined by ENTRY below.  The default applies only
   when STRCAT has not already been defined, so a file that defines
   STRCAT before including this one chooses a different symbol name.  */
28# ifndef STRCAT
29# define STRCAT __strcat_sse2_unaligned
30# endif
31
/* NOTE(review): USE_AS_STRCAT is not referenced anywhere in this file;
   presumably it is tested by the included strcpy-sse2-unaligned.S --
   confirm against that file.  */
32# define USE_AS_STRCAT
33
/* Everything that follows is assembled into the .text section.  */
34.text
/* STRCAT: locate the terminating NUL of the string at %rdi, then hand
   over to the copy code pulled in from strcpy-sse2-unaligned.S.

   In:   %rdi = destination string (scanned for its NUL byte)
         %rsi = source pointer (passed on to the copy code in %rcx)
         %rdx = count, only when USE_AS_STRNCAT is defined

   Register roles in the code visible here:
         %r9  = original %rdi; copied to %rax as the result
         %r8  = count (USE_AS_STRNCAT only)
         %rax = 0 or the 16-byte-aligned scan pointer while searching;
                on arrival at L(StartStrcpyPart) the offset of the NUL
                from the start of the destination
         %edx = pmovmskb bit mask, one bit per byte that equals zero
         %xmm0-%xmm3 = zero comparand going into each pcmpeqb and the
                compare result coming out of it
         %xmm4-%xmm5 = temporaries of the 64-byte loop

   Written here: %rax, %rcx, %rdx, %rdi, %r8 (USE_AS_STRNCAT only),
   %r9, %r10, %xmm0-%xmm5 and the flags.

   Nothing in this file returns: execution continues into the code
   included at the bottom, which is not visible here.  */
35ENTRY (STRCAT)
	/* Keep the original destination pointer for the result.  */
36 mov %rdi, %r9
37# ifdef USE_AS_STRNCAT
38# ifdef __ILP32__
39 /* Clear the upper 32 bits. */
40 movl %edx, %edx
41# endif
	/* %r8 = count; it is tested at L(StartStrcpyPart).  */
42 mov %rdx, %r8
43# endif
44
45/* Inline corresponding strlen file, temporary until new strcpy
46 implementation gets merged. */
47
	/* %rax = 0, so on the L(exit_less16) path below the bsf index alone
	   is the length.  %ecx = offset of %rdi within its 64-byte block.
	   If that offset is above 0x30, a 16-byte read starting at %rdi
	   would run past the end of the 64-byte block, so the aligned path
	   at L(next) is used instead (presumably to avoid touching memory
	   beyond a boundary -- TODO confirm the intent).  */
48 xor %rax, %rax
49 mov %edi, %ecx
50 and $0x3f, %ecx
51 pxor %xmm0, %xmm0
52 cmp $0x30, %ecx
53 ja L(next)
	/* Fast path: unaligned check of the first 16 bytes at %rdi.  */
54 movdqu (%rdi), %xmm1
55 pcmpeqb %xmm1, %xmm0
56 pmovmskb %xmm0, %edx
57 test %edx, %edx
58 jnz L(exit_less16)
	/* No NUL in bytes 0..15.  Round %rdi down to 16 bytes; the scan
	   resumes at 16(%rax), which is never beyond %rdi + 16, so no byte
	   is skipped.  */
59 mov %rdi, %rax
60 and $-16, %rax
61 jmp L(align16_start)
62L(next):
	/* Check the aligned 16-byte block that contains %rdi.  That block
	   also holds bytes in front of the string, so their mask bits are
	   cleared with %r10d = -1 << (%rdi & 15).  The 32-bit shl only uses
	   the low 5 bits of %cl, and on this path those bits of
	   (%rdi & 0x3f) - (%rdi & -16) equal %rdi & 15.  */
63 mov %rdi, %rax
64 and $-16, %rax
65 pcmpeqb (%rax), %xmm0
66 mov $-1, %r10d
67 sub %rax, %rcx
68 shl %cl, %r10d
69 pmovmskb %xmm0, %edx
70 and %r10d, %edx
71 jnz L(exit)
72
	/* Unrolled scan of sixteen aligned 16-byte blocks (256 bytes),
	   starting at 16(%rax).  Each group of four checks the blocks at
	   offsets 16/32/48/64 from %rax and leaves through the matching
	   L(exitNN) label when a block contains a NUL.  */
73L(align16_start):
	/* Fresh zero comparands; %xmm0 may hold a non-zero compare result
	   when coming from L(next).  */
74 pxor %xmm0, %xmm0
75 pxor %xmm1, %xmm1
76 pxor %xmm2, %xmm2
77 pxor %xmm3, %xmm3
78 pcmpeqb 16(%rax), %xmm0
79 pmovmskb %xmm0, %edx
80 test %edx, %edx
81 jnz L(exit16)
82
83 pcmpeqb 32(%rax), %xmm1
84 pmovmskb %xmm1, %edx
85 test %edx, %edx
86 jnz L(exit32)
87
88 pcmpeqb 48(%rax), %xmm2
89 pmovmskb %xmm2, %edx
90 test %edx, %edx
91 jnz L(exit48)
92
93 pcmpeqb 64(%rax), %xmm3
94 pmovmskb %xmm3, %edx
95 test %edx, %edx
96 jnz L(exit64)
97
	/* Second group.  The block at 80(%rax) is compared first and %rax is
	   then advanced by 64, so that block is now at 16(%rax) and the same
	   exit labels apply.  %xmm0-%xmm3 need no re-zeroing: each holds the
	   result of a compare that found no match, i.e. all zero bytes.  */
98 pcmpeqb 80(%rax), %xmm0
99 add $64, %rax
100 pmovmskb %xmm0, %edx
101 test %edx, %edx
102 jnz L(exit16)
103
104 pcmpeqb 32(%rax), %xmm1
105 pmovmskb %xmm1, %edx
106 test %edx, %edx
107 jnz L(exit32)
108
109 pcmpeqb 48(%rax), %xmm2
110 pmovmskb %xmm2, %edx
111 test %edx, %edx
112 jnz L(exit48)
113
114 pcmpeqb 64(%rax), %xmm3
115 pmovmskb %xmm3, %edx
116 test %edx, %edx
117 jnz L(exit64)
118
	/* Third group, same pattern.  */
119 pcmpeqb 80(%rax), %xmm0
120 add $64, %rax
121 pmovmskb %xmm0, %edx
122 test %edx, %edx
123 jnz L(exit16)
124
125 pcmpeqb 32(%rax), %xmm1
126 pmovmskb %xmm1, %edx
127 test %edx, %edx
128 jnz L(exit32)
129
130 pcmpeqb 48(%rax), %xmm2
131 pmovmskb %xmm2, %edx
132 test %edx, %edx
133 jnz L(exit48)
134
135 pcmpeqb 64(%rax), %xmm3
136 pmovmskb %xmm3, %edx
137 test %edx, %edx
138 jnz L(exit64)
139
	/* Fourth group, same pattern.  */
140 pcmpeqb 80(%rax), %xmm0
141 add $64, %rax
142 pmovmskb %xmm0, %edx
143 test %edx, %edx
144 jnz L(exit16)
145
146 pcmpeqb 32(%rax), %xmm1
147 pmovmskb %xmm1, %edx
148 test %edx, %edx
149 jnz L(exit32)
150
151 pcmpeqb 48(%rax), %xmm2
152 pmovmskb %xmm2, %edx
153 test %edx, %edx
154 jnz L(exit48)
155
156 pcmpeqb 64(%rax), %xmm3
157 pmovmskb %xmm3, %edx
158 test %edx, %edx
159 jnz L(exit64)
160
	/* Everything up to 79(%rax) has been checked.  Bring %rax to a
	   64-byte boundary for the main loop, checking one 16-byte block per
	   step.  If %rax is already 64-byte aligned the loop starts at
	   (%rax) and simply re-reads bytes that were checked above.  */
161 test $0x3f, %rax
162 jz L(align64_loop)
163
	/* First step checks the next unchecked block at 80(%rax).  After the
	   add that block is at (%rax), hence L(exit) with no extra offset.  */
164 pcmpeqb 80(%rax), %xmm0
165 add $80, %rax
166 pmovmskb %xmm0, %edx
167 test %edx, %edx
168 jnz L(exit)
169
170 test $0x3f, %rax
171 jz L(align64_loop)
172
173 pcmpeqb 16(%rax), %xmm1
174 add $16, %rax
175 pmovmskb %xmm1, %edx
176 test %edx, %edx
177 jnz L(exit)
178
179 test $0x3f, %rax
180 jz L(align64_loop)
181
182 pcmpeqb 16(%rax), %xmm2
183 add $16, %rax
184 pmovmskb %xmm2, %edx
185 test %edx, %edx
186 jnz L(exit)
187
188 test $0x3f, %rax
189 jz L(align64_loop)
190
	/* NOTE(review): %rax is 16-byte aligned, so it appears to always be
	   64-byte aligned by the test just above (at most three 16-byte
	   steps are needed); this fourth step and the add after it look
	   unreachable -- confirm before relying on or removing them.  */
191 pcmpeqb 16(%rax), %xmm3
192 add $16, %rax
193 pmovmskb %xmm3, %edx
194 test %edx, %edx
195 jnz L(exit)
196
197 add $16, %rax
	/* Main loop: 64 bytes per iteration from a 64-byte-aligned %rax.
	   pminub folds the four 16-byte blocks into one vector whose byte
	   lanes are zero exactly where at least one of the four blocks has a
	   zero byte in that lane.  %xmm0 is all zero on every path into the
	   loop (it holds a compare result that found no match) and is not
	   written inside the loop, so it serves as the zero comparand.  */
198 .p2align 4
199 L(align64_loop):
200 movaps (%rax), %xmm4
201 pminub 16(%rax), %xmm4
202 movaps 32(%rax), %xmm5
203 pminub 48(%rax), %xmm5
204 add $64, %rax
205 pminub %xmm4, %xmm5
206 pcmpeqb %xmm0, %xmm5
207 pmovmskb %xmm5, %edx
208 test %edx, %edx
209 jz L(align64_loop)
210
	/* A NUL lies in the 64 bytes just examined, which now start at
	   -64(%rax).  Rewind %rax by 80 so the four blocks sit at
	   16/32/48/64(%rax) and the L(exitNN) labels can be reused to find
	   which block holds it.  */
211 pcmpeqb -64(%rax), %xmm0
212 sub $80, %rax
213 pmovmskb %xmm0, %edx
214 test %edx, %edx
215 jnz L(exit16)
216
217 pcmpeqb 32(%rax), %xmm1
218 pmovmskb %xmm1, %edx
219 test %edx, %edx
220 jnz L(exit32)
221
222 pcmpeqb 48(%rax), %xmm2
223 pmovmskb %xmm2, %edx
224 test %edx, %edx
225 jnz L(exit48)
226
	/* The first three blocks had no NUL, so it is in the fourth one and
	   no test is needed.  Same arithmetic as L(exit64).  */
227 pcmpeqb 64(%rax), %xmm3
228 pmovmskb %xmm3, %edx
229 sub %rdi, %rax
230 bsf %rdx, %rdx
231 add %rdx, %rax
232 add $64, %rax
233 jmp L(StartStrcpyPart)
234
	/* Exit paths.  On entry %edx is the NUL mask of one 16-byte block
	   and %rdi is still the original destination pointer.  Each path
	   leaves in %rax the byte offset of the first NUL from %rdi:
	   (block address - %rdi) + index of the lowest set mask bit.  */
235 .p2align 4
	/* L(exit): the block is at (%rax).  */
236L(exit):
237 sub %rdi, %rax
	/* L(exit_less16): %rax already holds the base offset (0 when reached
	   from the unaligned fast path).  */
238L(exit_less16):
239 bsf %rdx, %rdx
240 add %rdx, %rax
241 jmp L(StartStrcpyPart)
242
	/* L(exit16): the block is at 16(%rax).  */
243 .p2align 4
244L(exit16):
245 sub %rdi, %rax
246 bsf %rdx, %rdx
247 add %rdx, %rax
248 add $16, %rax
249 jmp L(StartStrcpyPart)
250
	/* L(exit32): the block is at 32(%rax).  */
251 .p2align 4
252L(exit32):
253 sub %rdi, %rax
254 bsf %rdx, %rdx
255 add %rdx, %rax
256 add $32, %rax
257 jmp L(StartStrcpyPart)
258
	/* L(exit48): the block is at 48(%rax).  */
259 .p2align 4
260L(exit48):
261 sub %rdi, %rax
262 bsf %rdx, %rdx
263 add %rdx, %rax
264 add $48, %rax
265 jmp L(StartStrcpyPart)
266
	/* L(exit64): the block is at 64(%rax).  There is no jmp at the end;
	   execution falls through the alignment padding into
	   L(StartStrcpyPart).  */
267 .p2align 4
268L(exit64):
269 sub %rdi, %rax
270 bsf %rdx, %rdx
271 add %rdx, %rax
272 add $64, %rax
273
	/* Hand-over to the copy code.  %rax = offset of the destination's
	   NUL, %r9 = original destination pointer.  Afterwards:
	   %rdi = address of that NUL, %rcx = source pointer,
	   %rax = original destination pointer.  */
274 .p2align 4
275L(StartStrcpyPart):
276 lea (%r9, %rax), %rdi
277 mov %rsi, %rcx
278 mov %r9, %rax /* save result */
279
280# ifdef USE_AS_STRNCAT
	/* A zero count branches to L(ExitZero), a label that is not defined
	   in this file (presumably provided by the included file -- confirm).
	   USE_AS_STRNCPY is defined just before the include; presumably it
	   selects the length-bounded variant of the copy code -- confirm.  */
281 test %r8, %r8
282 jz L(ExitZero)
283# define USE_AS_STRNCPY
284# endif
285
	/* The rest of the function comes from this include.  */
286# include "strcpy-sse2-unaligned.S"
287#endif
288

source code of glibc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S