1/* strlen optimized with SSE2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21/* ISA level >= 2 for both strlen and wcslen. wcslen uses `pminud`
22 which is SSE4.1. strlen doesn't have an ISA level == 2
23 implementation so the SSE2 implementation must be built with ISA
24 level == 2. */
/* Everything below is assembled only when ISA_SHOULD_BUILD (2) holds;
   the conditional is closed by the #endif at the end of the file.  */
25# if ISA_SHOULD_BUILD (2)
26
27# include <sysdep.h>
28
/* Name of the generated entry point.  A file that includes this one may
   define STRLEN beforehand to choose a different symbol name.  */
29# ifndef STRLEN
30# define STRLEN __strlen_sse2
31# endif
32
/* Element-width selection.
   AS_WCSLEN defined: 4-byte elements.  PMINU/PCMPEQ are the dword
   forms and SHIFT_RETURN divides the byte offset in %rax by 4
   (shrq $2) to produce an element count.
   Otherwise: 1-byte elements.  PMINU/PCMPEQ are the byte forms and
   SHIFT_RETURN expands to nothing because the byte offset already is
   the result.  */
33# ifdef AS_WCSLEN
34# define PMINU pminud
35# define PCMPEQ pcmpeqd
36# define SHIFT_RETURN shrq $2, %rax
37# else
38# define PMINU pminub
39# define PCMPEQ pcmpeqb
40# define SHIFT_RETURN
41# endif
42
/* SECTION(p) wraps the section name used for the code below.  The
   default is the identity, so SECTION(.text) is plain .text unless the
   including file defined SECTION differently.  */
43# ifndef SECTION
44# define SECTION(p) p
45# endif
46
47/* Long-lived registers in strlen(s), strnlen(s, n) are:
48
49 %xmm3 - zero
50 %rdi - s
51 %r10 - (s+n) & (~(64-1)) (strnlen only)
52 %r11 - s+n (strnlen only)
53*/
54
55
/* Entry point.  As used by the code below: %rdi holds the string
   pointer s, and in the AS_STRNLEN build %rsi holds the maximum length
   n.  The result is returned in %rax.
   NOTE(review): ENTRY/END and L() are not defined in this file;
   presumably they come from <sysdep.h> -- confirm.  */
56 .section SECTION(.text),"ax",@progbits
57ENTRY(STRLEN)
58
59/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
/* FIND_ZERO details:
   Preconditions: %xmm0-%xmm3 must all be zero on entry, because each
   one is the "compare against zero" operand for one 16-byte quarter of
   the 64-byte block at %rax.  Every call site below aligns %rax (to 16
   or 64 bytes) before expanding the macro.
   Result: bit i of %rdx is set when byte i of the block belongs to an
   element that compared equal to zero.  The four 16-bit pmovmskb masks
   are combined as
     %rdx = mask0 | mask1 << 16 | mask2 << 32 | mask3 << 48.
   Clobbers: %rsi, %rcx, %r8, %rdx, flags, and %xmm0-%xmm3 (they now
   hold the compare results, so they are no longer guaranteed to be
   zero afterwards).  */
60# define FIND_ZERO \
61 PCMPEQ (%rax), %xmm0; \
62 PCMPEQ 16(%rax), %xmm1; \
63 PCMPEQ 32(%rax), %xmm2; \
64 PCMPEQ 48(%rax), %xmm3; \
65 pmovmskb %xmm0, %esi; \
66 pmovmskb %xmm1, %edx; \
67 pmovmskb %xmm2, %r8d; \
68 pmovmskb %xmm3, %ecx; \
69 salq $16, %rdx; \
70 salq $16, %rcx; \
71 orq %rsi, %rdx; \
72 orq %r8, %rcx; \
73 salq $32, %rcx; \
74 orq %rcx, %rdx;
75
/* strnlen/wcsnlen-only entry code.
   NOTE(review): %RSI_LP, %RDI_LP, %R10_LP and %R11_LP are not defined in
   this file; presumably they are the pointer-width register names from
   <sysdep.h> -- confirm.  */
76# ifdef AS_STRNLEN
77/* Do not read anything when n==0. */
78 test %RSI_LP, %RSI_LP
79 jne L(n_nonzero)
/* n == 0: return 0 without touching memory.  */
80 xor %rax, %rax
81 ret
82L(n_nonzero):
83# ifdef AS_WCSLEN
84/* Check for overflow from maxlen * sizeof(wchar_t). If it would
85 overflow the only way this program doesn't have undefined behavior
86 is if there is a null terminator in valid memory so wcslen will
87 suffice. */
/* The arithmetic shift by 62 leaves a non-zero value exactly when one
   of the two top bits of n is set, i.e. when n * 4 would not fit.  In
   that case control transfers to OVERFLOW_STRLEN.
   NOTE(review): OVERFLOW_STRLEN is not defined in this file; it must be
   supplied by whatever defines both AS_STRNLEN and AS_WCSLEN.  */
88 mov %RSI_LP, %R10_LP
89 sar $62, %R10_LP
90 jnz OVERFLOW_STRLEN
/* Convert the element count into a byte count.  */
91 sal $2, %RSI_LP
92# endif
93
94/* Initialize long lived registers. */
/* After this sequence:
     %rsi = %r11 = s + n (one past the last byte that may be examined)
     %r10 = (s + n) & -64 (start of the 64-byte block containing the end)
   The main loop compares its block pointer against %r10 to know when
   the final, possibly partial, block has been reached.  */
95 add %RDI_LP, %RSI_LP
96 mov %RSI_LP, %R10_LP
97 and $-64, %R10_LP
98 mov %RSI_LP, %R11_LP
99# endif
100
/* Zero the four vector registers that FIND_ZERO and the main loop use
   as the "equal to zero" comparand.  */
101 pxor %xmm0, %xmm0
102 pxor %xmm1, %xmm1
103 pxor %xmm2, %xmm2
104 pxor %xmm3, %xmm3
/* %rax becomes the working block pointer; %rcx = s & 4095 is the offset
   of s within a 4096-byte page.  */
105 movq %rdi, %rax
106 movq %rdi, %rcx
107 andq $4095, %rcx
108/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
/* The fast path reads 64 bytes starting at s rounded down to 16.  With
   a page offset of at most 4047 that read ends at offset 4095 or
   earlier, so it stays inside the page.  Larger offsets take the
   cross_page path, which rounds down to 64 instead.  */
109 cmpq $4047, %rcx
110/* We cannot unify this branching as it would be ~6 cycles slower. */
111 ja L(cross_page)
112
113# ifdef AS_STRNLEN
114/* Test if end is among first 64 bytes. */
/* STRNLEN_PROLOG (strnlen build):
     %rsi = %r11 - %rax, the distance from the block just scanned to the
     end s + n; then %rax is rounded down to a 64-byte boundary.
     testq $-64 sets ZF when %rsi has no bit above bit 5, i.e. when the
     distance is below 64, and control then goes to L(strnlen_ret) with
     that distance still in %rsi.  Otherwise execution falls through
     into the rest of PROLOG.  */
115# define STRNLEN_PROLOG \
116 mov %r11, %rsi; \
117 subq %rax, %rsi; \
118 andq $-64, %rax; \
119 testq $-64, %rsi; \
120 je L(strnlen_ret)
121# else
/* STRNLEN_PROLOG (strlen build): only round %rax down to 64 bytes so
   the main loop can use aligned 64-byte steps.  */
122# define STRNLEN_PROLOG andq $-64, %rax;
123# endif
124
125/* Ignore bits in mask that come before start of string. */
/* PROLOG(lab):
   On entry %rax is s rounded down to the alignment used for the scan
   and %rdx is the match mask relative to %rax.
     %rcx = %rdi ^ %rax leaves only the low bits of s, i.e. the offset of
     s inside the scanned block, so shifting %rdx right by %cl makes
     bit 0 correspond to s itself and drops matches before the string.
   If no bit remains, jump to L(lab) to continue with the main loop;
   %rax is then 64-byte aligned.
   Otherwise bsfq yields the byte offset of the first match from s,
   SHIFT_RETURN converts it to an element count when needed, and the
   function returns.
   Clobbers %rcx, %rdx, flags (and %rsi in the strnlen build).  */
126# define PROLOG(lab) \
127 movq %rdi, %rcx; \
128 xorq %rax, %rcx; \
129 STRNLEN_PROLOG; \
130 sarq %cl, %rdx; \
131 test %rdx, %rdx; \
132 je L(lab); \
133 bsfq %rdx, %rax; \
134 SHIFT_RETURN; \
135 ret
136
/* Fast path: s is far enough from the end of its page that 64 bytes
   starting at s rounded down to 16 can be read safely.  */
137# ifdef AS_STRNLEN
/* strnlen build: scan the whole 64-byte window at once; PROLOG below
   discards the matches that lie before s.  */
138 andq $-16, %rax
139 FIND_ZERO
140# else
141 /* Test first 16 bytes unaligned. */
/* %xmm0 is still zero here, so the compare marks zero elements among
   the 16 bytes starting exactly at s.  A hit returns immediately: the
   bit index is already the byte offset from s.  */
142 movdqu (%rax), %xmm4
143 PCMPEQ %xmm0, %xmm4
144 pmovmskb %xmm4, %edx
145 test %edx, %edx
146 je L(next48_bytes)
147 bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
148 SHIFT_RETURN
149 ret
150
151L(next48_bytes):
152/* Same as FIND_ZERO except we do not check first 16 bytes. */
/* %rax is rounded down to 16 and only bytes 16..63 of that window are
   compared.  The low 16 bits of %rdx therefore stay clear: the bytes
   they would describe that belong to the string were already covered
   by the unaligned check above.  Resulting layout:
     %rdx = mask1 << 16 | mask2 << 32 | mask3 << 48.  */
153 andq $-16, %rax
154 PCMPEQ 16(%rax), %xmm1
155 PCMPEQ 32(%rax), %xmm2
156 PCMPEQ 48(%rax), %xmm3
157 pmovmskb %xmm1, %edx
158 pmovmskb %xmm2, %r8d
159 pmovmskb %xmm3, %ecx
160 salq $16, %rdx
161 salq $16, %rcx
162 orq %r8, %rcx
163 salq $32, %rcx
164 orq %rcx, %rdx
165# endif
166
167 /* When no zero byte is found xmm1-3 are zero so we do not have to
168 zero them. */
/* Hence PROLOG continues at L(loop) directly, skipping the re-zeroing
   done at L(loop_init).  */
169 PROLOG(loop)
170
/* Slow path: the page offset of s is above 4047.  Rounding %rax down to
   64 bytes keeps the 64-byte read inside one 64-byte-aligned block, so
   it cannot run into the next page.  The scan may report matches that
   lie before s; PROLOG shifts those away.  Because such matches can
   leave %xmm1-%xmm3 non-zero, the no-match continuation is
   L(loop_init), which clears them again, rather than L(loop).  */
171 .p2align 4
172L(cross_page):
173 andq $-64, %rax
174 FIND_ZERO
175 PROLOG(loop_init)
176
177# ifdef AS_STRNLEN
178/* We must do this check to correctly handle strnlen (s, -1). */
/* Reached from STRNLEN_PROLOG when the distance in %rsi from the
   scanned block to the end s + n is below 64.  At this point %rdx is
   still the unshifted match mask and %cl the offset of s in the block.
   bts marks the end position in the mask, so the first set bit found
   is either a real terminator or the length limit, whichever comes
   first.  After the shift by %cl, an empty mask means the marked bit
   was below the start of the string; scanning then continues in the
   main loop through L(loop_init).  */
179L(strnlen_ret):
180 bts %rsi, %rdx
181 sarq %cl, %rdx
182 test %rdx, %rdx
183 je L(loop_init)
184 bsfq %rdx, %rax
185 SHIFT_RETURN
186 ret
187# endif
/* Re-establish the invariant the main loop relies on: %xmm1-%xmm3 are
   zero (%xmm3 is the comparand in the loop, and all of %xmm0-%xmm3 must
   be zero for FIND_ZERO at the exits; %xmm0 is cleared there).  */
188 .p2align 4
189L(loop_init):
190 pxor %xmm1, %xmm1
191 pxor %xmm2, %xmm2
192 pxor %xmm3, %xmm3
193# ifdef AS_STRNLEN
/* strnlen main loop: one 64-byte aligned block per iteration.
   %rax is the block pointer, %r10 the block that contains the end.  */
194 .p2align 4
195L(loop):
196
197 addq $64, %rax
/* Stop before reading the block that contains s + n; it is handled
   separately at L(exit_end).  */
198 cmpq %rax, %r10
199 je L(exit_end)
200
/* The element-wise unsigned minimum of the four 16-byte vectors has a
   zero element exactly when at least one of them has a zero element in
   that position, so a single compare against %xmm3 (zero) tests all 64
   bytes.  */
201 movdqa (%rax), %xmm0
202 PMINU 16(%rax), %xmm0
203 PMINU 32(%rax), %xmm0
204 PMINU 48(%rax), %xmm0
205 PCMPEQ %xmm3, %xmm0
206 pmovmskb %xmm0, %edx
207 testl %edx, %edx
208 jne L(exit)
209 jmp L(loop)
210
/* %rax == %r10: this is the block containing the end s + n.  */
211 .p2align 4
212L(exit_end):
213 cmp %rax, %r11
214 je L(first) /* Do not read when end is at page boundary. */
/* The end lies inside this block: build the exact match mask.  %xmm0
   holds a compare result or loaded data at this point and must be
   cleared first; %xmm1-%xmm3 are still zero.  */
215 pxor %xmm0, %xmm0
216 FIND_ZERO
217
/* Mark the end position in the mask and take the first set bit, so the
   result never exceeds n.  The bit index comes from the low bits of
   %r11, i.e. the offset of s + n inside the current block.  When this
   label is reached by the branch above, %rdx is zero (every path into
   the loop and every loop iteration leaves it zero), so the result is
   exactly n.  */
218L(first):
219 bts %r11, %rdx
220 bsfq %rdx, %rdx
/* Length in bytes = block address + offset in block - s.  */
221 addq %rdx, %rax
222 subq %rdi, %rax
223 SHIFT_RETURN
224 ret
225
/* A zero element exists somewhere in the current block, which is not
   the block containing the end: locate it precisely with FIND_ZERO.  */
226 .p2align 4
227L(exit):
228 pxor %xmm0, %xmm0
229 FIND_ZERO
230
231 bsfq %rdx, %rdx
232 addq %rdx, %rax
233 subq %rdi, %rax
234 SHIFT_RETURN
235 ret
236
237# else
238
239 /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
/* strlen main loop.  On entry %rax is the 64-byte aligned block that has
   already been scanned and %xmm1-%xmm3 are zero.  Each iteration tests
   two consecutive 64-byte blocks; there is no length limit here, so the
   loop ends only when a zero element is found.  */
240 .p2align 4
241L(loop):
242
/* First half: block at %rax + 64.  The unsigned minimum of the four
   vectors contains a zero element exactly when one of them does, so one
   compare against %xmm3 (zero) covers all 64 bytes.  */
243 movdqa 64(%rax), %xmm0
244 PMINU 80(%rax), %xmm0
245 PMINU 96(%rax), %xmm0
246 PMINU 112(%rax), %xmm0
247 PCMPEQ %xmm3, %xmm0
248 pmovmskb %xmm0, %edx
249 testl %edx, %edx
250 jne L(exit64)
251
/* Advance by 128 bytes (subtracting -128 adds 128).  */
252 subq $-128, %rax
253
/* Second half: block at the new %rax.  */
254 movdqa (%rax), %xmm0
255 PMINU 16(%rax), %xmm0
256 PMINU 32(%rax), %xmm0
257 PMINU 48(%rax), %xmm0
258 PCMPEQ %xmm3, %xmm0
259 pmovmskb %xmm0, %edx
260 testl %edx, %edx
261 jne L(exit0)
262 jmp L(loop)
263
/* Hit in the first half: make %rax point at the block that contains the
   zero element, then share the exit code with the second half.  */
264 .p2align 4
265L(exit64):
266 addq $64, %rax
/* %rax points at the block containing the first zero element.  %xmm0
   holds the compare result and is cleared so that %xmm0-%xmm3 are all
   zero as FIND_ZERO requires.  */
267L(exit0):
268 pxor %xmm0, %xmm0
269 FIND_ZERO
270
/* Length in bytes = block address + offset of the first match - s;
   SHIFT_RETURN turns it into an element count when needed.  */
271 bsfq %rdx, %rdx
272 addq %rdx, %rax
273 subq %rdi, %rax
274 SHIFT_RETURN
275 ret
276
277# endif
278
279END(STRLEN)
280#endif
281

source code of glibc/sysdeps/x86_64/multiarch/strlen-sse2.S