1/* wcslen optimized with SSE2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21#if ISA_SHOULD_BUILD (1)
22
23# include <sysdep.h>
24
/* Default symbol name for this implementation.  A definition of WCSLEN
   made before this point takes precedence over the default below.  */
25#ifndef WCSLEN
26# define WCSLEN __wcslen_sse2
27#endif
28
29
30 .text
/* WCSLEN: count the 4-byte elements that precede the first all-zero
   4-byte element of the array addressed by %rdi.

   In:       %rdi = start of the array (s).
   Out:      %rax = element count.
   Written:  %rax, %rdx, %rdi, %xmm0-%xmm3, %xmm6, flags.
   Stack:    not used.

   Outline:
     1. Test elements 0..7 one at a time.
     2. Test twelve 16-byte aligned vectors (192 bytes), one at a time.
     3. Round down to a 64-byte boundary and scan 64 bytes per
	iteration with a combined (byte-minimum) test.
     4. L(exit): turn the vector position plus the pmovmskb mask into
	an element index.  */
31ENTRY (WCSLEN)
 /* Step 1: elements 0..7.  L(exit_tailN) returns the constant N, so no
    address arithmetic is needed on these paths.  */
32 cmpl $0, (%rdi)
33 jz L(exit_tail0)
34 cmpl $0, 4(%rdi)
35 jz L(exit_tail1)
36 cmpl $0, 8(%rdi)
37 jz L(exit_tail2)
38 cmpl $0, 12(%rdi)
39 jz L(exit_tail3)
40 cmpl $0, 16(%rdi)
41 jz L(exit_tail4)
42 cmpl $0, 20(%rdi)
43 jz L(exit_tail5)
44 cmpl $0, 24(%rdi)
45 jz L(exit_tail6)
46 cmpl $0, 28(%rdi)
47 jz L(exit_tail7)
48
 /* Step 2 setup.  %xmm0 = 0 is the comparison pattern.  */
49 pxor %xmm0, %xmm0
50
 /* %rax = (s + 32) rounded down to a 16-byte boundary, i.e. an aligned
    address in (s + 16, s + 32]; the first vector may therefore overlap
    elements already tested above.
    %rdi = s + 16.  Every path below advances %rax by 16 past the vector
    it tested before jumping to L(exit), so there %rax - %rdi is the
    byte offset of that vector from s.  */
51 lea 32(%rdi), %rax
52 addq $16, %rdi
53 and $-16, %rax
54
 /* Each block below: compare four elements at (%rax) with zero, gather
    the per-byte result into %edx (four mask bits per element), advance
    %rax, and leave through L(exit) if any element matched.  The first
    three blocks also clear the register used by the following block.  */
55 pcmpeqd (%rax), %xmm0
56 pmovmskb %xmm0, %edx
57 pxor %xmm1, %xmm1
58 addq $16, %rax
59 test %edx, %edx
60 jnz L(exit)
61
62 pcmpeqd (%rax), %xmm1
63 pmovmskb %xmm1, %edx
64 pxor %xmm2, %xmm2
65 addq $16, %rax
66 test %edx, %edx
67 jnz L(exit)
68
69 pcmpeqd (%rax), %xmm2
70 pmovmskb %xmm2, %edx
71 pxor %xmm3, %xmm3
72 addq $16, %rax
73 test %edx, %edx
74 jnz L(exit)
75
76 pcmpeqd (%rax), %xmm3
77 pmovmskb %xmm3, %edx
78 addq $16, %rax
79 test %edx, %edx
80 jnz L(exit)
81
 /* From here on %xmm0-%xmm3 are reused without clearing them again: a
    compare that matched nothing leaves its destination all-zero, and
    any match would already have branched to L(exit).  */
82 pcmpeqd (%rax), %xmm0
83 pmovmskb %xmm0, %edx
84 addq $16, %rax
85 test %edx, %edx
86 jnz L(exit)
87
88 pcmpeqd (%rax), %xmm1
89 pmovmskb %xmm1, %edx
90 addq $16, %rax
91 test %edx, %edx
92 jnz L(exit)
93
94 pcmpeqd (%rax), %xmm2
95 pmovmskb %xmm2, %edx
96 addq $16, %rax
97 test %edx, %edx
98 jnz L(exit)
99
100 pcmpeqd (%rax), %xmm3
101 pmovmskb %xmm3, %edx
102 addq $16, %rax
103 test %edx, %edx
104 jnz L(exit)
105
106 pcmpeqd (%rax), %xmm0
107 pmovmskb %xmm0, %edx
108 addq $16, %rax
109 test %edx, %edx
110 jnz L(exit)
111
112 pcmpeqd (%rax), %xmm1
113 pmovmskb %xmm1, %edx
114 addq $16, %rax
115 test %edx, %edx
116 jnz L(exit)
117
118 pcmpeqd (%rax), %xmm2
119 pmovmskb %xmm2, %edx
120 addq $16, %rax
121 test %edx, %edx
122 jnz L(exit)
123
124 pcmpeqd (%rax), %xmm3
125 pmovmskb %xmm3, %edx
126 addq $16, %rax
127 test %edx, %edx
128 jnz L(exit)
129
 /* Step 3 setup: round %rax down to a 64-byte boundary.  This can step
    back over up to 48 bytes that were just tested; they are simply
    scanned again.  */
130 and $-0x40, %rax
131
132 .p2align 4
133L(aligned_64_loop):
 /* Load the four vectors of this 64-byte chunk.  */
134 movaps (%rax), %xmm0
135 movaps 16(%rax), %xmm1
136 movaps 32(%rax), %xmm2
137 movaps 48(%rax), %xmm6
138
 /* %xmm2 = byte-wise unsigned minimum of all four vectors (this
    overwrites %xmm0 and %xmm2 but leaves %xmm1 and %xmm6 intact).
    %xmm3 is all-zero here, so the compare flags every 4-byte lane of
    the minimum that is zero.  A zero element in any vector always
    produces such a lane, but zero bytes coming from different vectors
    can produce one too, so a hit is confirmed per vector below.  */
139 pminub %xmm1, %xmm0
140 pminub %xmm6, %xmm2
141 pminub %xmm0, %xmm2
142 pcmpeqd %xmm3, %xmm2
143 pmovmskb %xmm2, %edx
144 addq $64, %rax
145 test %edx, %edx
146 jz L(aligned_64_loop)
147
 /* Possible hit somewhere in the chunk at -64(%rax).  Test the four
    vectors in order.  %rax stays at chunk + 64, so %rdi is moved
    instead to keep %rax - %rdi equal to the byte offset of the vector
    under test: +48 for vector 0, then -16 for each later vector.  The
    adjustments sum to zero, so %rdi is back to s + 16 if the loop is
    resumed.  */

 /* Vector 0: reloaded from memory because pminub overwrote %xmm0.  */
148 pcmpeqd -64(%rax), %xmm3
149 pmovmskb %xmm3, %edx
150 addq $48, %rdi
151 test %edx, %edx
152 jnz L(exit)
153
 /* Vector 1: still intact in %xmm1.  */
154 pcmpeqd %xmm1, %xmm3
155 pmovmskb %xmm3, %edx
156 addq $-16, %rdi
157 test %edx, %edx
158 jnz L(exit)
159
 /* Vector 2: reloaded from memory because pminub overwrote %xmm2.  */
160 pcmpeqd -32(%rax), %xmm3
161 pmovmskb %xmm3, %edx
162 addq $-16, %rdi
163 test %edx, %edx
164 jnz L(exit)
165
 /* Vector 3: still intact in %xmm6.  If it does not match either, the
    combined test was a false positive: continue with the next chunk.
    Otherwise fall through into L(exit).  */
166 pcmpeqd %xmm6, %xmm3
167 pmovmskb %xmm3, %edx
168 addq $-16, %rdi
169 test %edx, %edx
170 jz L(aligned_64_loop)
171
172 .p2align 4
173L(exit):
 /* %rax - %rdi = byte offset from s of the vector that matched; >> 2
    converts it to the index of that vector's first element.  %edx holds
    four mask bits per element: bits 0-3 element 0, 4-7 element 1,
    8-11 element 2, 12-15 element 3.  */
174 sub %rdi, %rax
175 shr $2, %rax
 /* Low mask byte clear: the first zero is element 2 or 3.  */
176 test %dl, %dl
177 jz L(exit_high)
178
 /* Element 0 matched: %rax is already the answer.  Otherwise it is
    element 1.  */
179 andl $15, %edx
180 jz L(exit_1)
181 ret
182
183 /* No align here. Naturally aligned % 16 == 1. */
184L(exit_high):
 /* Bits 8-11 set: element 2.  Otherwise element 3.  */
185 andl $(15 << 8), %edx
186 jz L(exit_3)
187 add $2, %rax
188 ret
189
190 .p2align 3
191L(exit_1):
192 add $1, %rax
193 ret
194
195 .p2align 3
196L(exit_3):
197 add $3, %rax
198 ret
199
 /* L(exit_tailN): element N, one of the first eight, is the first zero
    element; return N.  */
200 .p2align 3
201L(exit_tail0):
202 xorl %eax, %eax
203 ret
204
205 .p2align 3
206L(exit_tail1):
207 movl $1, %eax
208 ret
209
210 .p2align 3
211L(exit_tail2):
212 movl $2, %eax
213 ret
214
215 .p2align 3
216L(exit_tail3):
217 movl $3, %eax
218 ret
219
220 .p2align 3
221L(exit_tail4):
222 movl $4, %eax
223 ret
224
225 .p2align 3
226L(exit_tail5):
227 movl $5, %eax
228 ret
229
230 .p2align 3
231L(exit_tail6):
232 movl $6, %eax
233 ret
234
235 .p2align 3
236L(exit_tail7):
237 movl $7, %eax
238 ret
239
240END (WCSLEN)
241
242#endif
243

/* Source: glibc/sysdeps/x86_64/multiarch/wcslen-sse2.S */