/* strstr with unaligned loads
   Copyright (C) 2009-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20#include "../strchr-isa-default-impl.h"
21
22ENTRY(__strstr_sse2_unaligned)
23 movzbl (%rsi), %eax
24 testb %al, %al
25 je L(empty)
26 movzbl 1(%rsi), %edx
27 testb %dl, %dl
28 je L(strchr)
29 movd %eax, %xmm1
30 movd %edx, %xmm2
31 movq %rdi, %rax
32 andl $4095, %eax
33 punpcklbw %xmm1, %xmm1
34 cmpq $4031, %rax
35 punpcklbw %xmm2, %xmm2
36 punpcklwd %xmm1, %xmm1
37 punpcklwd %xmm2, %xmm2
38 pshufd $0, %xmm1, %xmm1
39 pshufd $0, %xmm2, %xmm2
40 ja L(cross_page)
41 movdqu (%rdi), %xmm3
42 pxor %xmm5, %xmm5
43 movdqu 1(%rdi), %xmm4
44 movdqa %xmm3, %xmm6
45 pcmpeqb %xmm1, %xmm3
46 pcmpeqb %xmm2, %xmm4
47 movdqu 16(%rdi), %xmm0
48 pcmpeqb %xmm5, %xmm6
49 pminub %xmm4, %xmm3
50 movdqa %xmm3, %xmm4
51 movdqu 17(%rdi), %xmm3
52 pcmpeqb %xmm0, %xmm5
53 pcmpeqb %xmm2, %xmm3
54 por %xmm6, %xmm4
55 pcmpeqb %xmm1, %xmm0
56 pminub %xmm3, %xmm0
57 por %xmm5, %xmm0
58 pmovmskb %xmm4, %r8d
59 pmovmskb %xmm0, %eax
60 salq $16, %rax
61 orq %rax, %r8
62 je L(next_32_bytes)
63L(next_pair_index):
64 bsf %r8, %rax
65 addq %rdi, %rax
66 cmpb $0, (%rax)
67 je L(zero1)
68 movzbl 2(%rsi), %edx
69 testb %dl, %dl
70 je L(found1)
71 cmpb 2(%rax), %dl
72 jne L(next_pair)
73 xorl %edx, %edx
74 jmp L(pair_loop_start)
75
76 .p2align 4
77L(strchr):
78 movzbl %al, %esi
79 jmp DEFAULT_STRCHR
80
81 .p2align 4
82L(pair_loop):
83 addq $1, %rdx
84 cmpb 2(%rax,%rdx), %cl
85 jne L(next_pair)
86L(pair_loop_start):
87 movzbl 3(%rsi,%rdx), %ecx
88 testb %cl, %cl
89 jne L(pair_loop)
90L(found1):
91 ret
92L(zero1):
93 xorl %eax, %eax
94 ret
95
96 .p2align 4
97L(next_pair):
98 leaq -1(%r8), %rax
99 andq %rax, %r8
100 jne L(next_pair_index)
101
102 .p2align 4
103L(next_32_bytes):
104 movdqu 32(%rdi), %xmm3
105 pxor %xmm5, %xmm5
106 movdqu 33(%rdi), %xmm4
107 movdqa %xmm3, %xmm6
108 pcmpeqb %xmm1, %xmm3
109 pcmpeqb %xmm2, %xmm4
110 movdqu 48(%rdi), %xmm0
111 pcmpeqb %xmm5, %xmm6
112 pminub %xmm4, %xmm3
113 movdqa %xmm3, %xmm4
114 movdqu 49(%rdi), %xmm3
115 pcmpeqb %xmm0, %xmm5
116 pcmpeqb %xmm2, %xmm3
117 por %xmm6, %xmm4
118 pcmpeqb %xmm1, %xmm0
119 pminub %xmm3, %xmm0
120 por %xmm5, %xmm0
121 pmovmskb %xmm4, %eax
122 salq $32, %rax
123 pmovmskb %xmm0, %r8d
124 salq $48, %r8
125 orq %rax, %r8
126 je L(loop_header)
127L(next_pair2_index):
128 bsfq %r8, %rax
129 addq %rdi, %rax
130 cmpb $0, (%rax)
131 je L(zero2)
132 movzbl 2(%rsi), %edx
133 testb %dl, %dl
134 je L(found2)
135 cmpb 2(%rax), %dl
136 jne L(next_pair2)
137 xorl %edx, %edx
138 jmp L(pair_loop2_start)
139
140 .p2align 4
141L(pair_loop2):
142 addq $1, %rdx
143 cmpb 2(%rax,%rdx), %cl
144 jne L(next_pair2)
145L(pair_loop2_start):
146 movzbl 3(%rsi,%rdx), %ecx
147 testb %cl, %cl
148 jne L(pair_loop2)
149L(found2):
150 ret
151 L(zero2):
152 xorl %eax, %eax
153 ret
154L(empty):
155 mov %rdi, %rax
156 ret
157
158 .p2align 4
159L(next_pair2):
160 leaq -1(%r8), %rax
161 andq %rax, %r8
162 jne L(next_pair2_index)
163L(loop_header):
164 movq $-512, %r11
165 movq %rdi, %r9
166
167 pxor %xmm7, %xmm7
168 andq $-64, %rdi
169
170 .p2align 4
171L(loop):
172 movdqa 64(%rdi), %xmm3
173 movdqu 63(%rdi), %xmm6
174 movdqa %xmm3, %xmm0
175 pxor %xmm2, %xmm3
176 pxor %xmm1, %xmm6
177 movdqa 80(%rdi), %xmm10
178 por %xmm3, %xmm6
179 pminub %xmm10, %xmm0
180 movdqu 79(%rdi), %xmm3
181 pxor %xmm2, %xmm10
182 pxor %xmm1, %xmm3
183 movdqa 96(%rdi), %xmm9
184 por %xmm10, %xmm3
185 pminub %xmm9, %xmm0
186 pxor %xmm2, %xmm9
187 movdqa 112(%rdi), %xmm8
188 addq $64, %rdi
189 pminub %xmm6, %xmm3
190 movdqu 31(%rdi), %xmm4
191 pminub %xmm8, %xmm0
192 pxor %xmm2, %xmm8
193 pxor %xmm1, %xmm4
194 por %xmm9, %xmm4
195 pminub %xmm4, %xmm3
196 movdqu 47(%rdi), %xmm5
197 pxor %xmm1, %xmm5
198 por %xmm8, %xmm5
199 pminub %xmm5, %xmm3
200 pminub %xmm3, %xmm0
201 pcmpeqb %xmm7, %xmm0
202 pmovmskb %xmm0, %eax
203 testl %eax, %eax
204 je L(loop)
205 pminub (%rdi), %xmm6
206 pminub 32(%rdi),%xmm4
207 pminub 48(%rdi),%xmm5
208 pcmpeqb %xmm7, %xmm6
209 pcmpeqb %xmm7, %xmm5
210 pmovmskb %xmm6, %edx
211 movdqa 16(%rdi), %xmm8
212 pcmpeqb %xmm7, %xmm4
213 movdqu 15(%rdi), %xmm0
214 pmovmskb %xmm5, %r8d
215 movdqa %xmm8, %xmm3
216 pmovmskb %xmm4, %ecx
217 pcmpeqb %xmm1,%xmm0
218 pcmpeqb %xmm2,%xmm3
219 salq $32, %rcx
220 pcmpeqb %xmm7,%xmm8
221 salq $48, %r8
222 pminub %xmm0,%xmm3
223 orq %rcx, %rdx
224 por %xmm3,%xmm8
225 orq %rdx, %r8
226 pmovmskb %xmm8, %eax
227 salq $16, %rax
228 orq %rax, %r8
229 je L(loop)
230L(next_pair_index3):
231 bsfq %r8, %rcx
232 addq %rdi, %rcx
233 cmpb $0, (%rcx)
234 je L(zero)
235 xorl %eax, %eax
236 movzbl 2(%rsi), %edx
237 testb %dl, %dl
238 je L(success3)
239 cmpb 1(%rcx), %dl
240 jne L(next_pair3)
241 jmp L(pair_loop_start3)
242
243 .p2align 4
244L(pair_loop3):
245 addq $1, %rax
246 cmpb 1(%rcx,%rax), %dl
247 jne L(next_pair3)
248L(pair_loop_start3):
249 movzbl 3(%rsi,%rax), %edx
250 testb %dl, %dl
251 jne L(pair_loop3)
252L(success3):
253 lea -1(%rcx), %rax
254 ret
255
256 .p2align 4
257L(next_pair3):
258 addq %rax, %r11
259 movq %rdi, %rax
260 subq %r9, %rax
261 cmpq %r11, %rax
262 jl L(switch_strstr)
263 leaq -1(%r8), %rax
264 andq %rax, %r8
265 jne L(next_pair_index3)
266 jmp L(loop)
267
268 .p2align 4
269L(switch_strstr):
270 movq %rdi, %rdi
271 jmp __strstr_generic
272
273 .p2align 4
274L(cross_page):
275
276 movq %rdi, %rax
277 pxor %xmm0, %xmm0
278 andq $-64, %rax
279 movdqa (%rax), %xmm3
280 movdqu -1(%rax), %xmm4
281 movdqa %xmm3, %xmm8
282 movdqa 16(%rax), %xmm5
283 pcmpeqb %xmm1, %xmm4
284 pcmpeqb %xmm0, %xmm8
285 pcmpeqb %xmm2, %xmm3
286 movdqa %xmm5, %xmm7
287 pminub %xmm4, %xmm3
288 movdqu 15(%rax), %xmm4
289 pcmpeqb %xmm0, %xmm7
290 por %xmm3, %xmm8
291 movdqa %xmm5, %xmm3
292 movdqa 32(%rax), %xmm5
293 pcmpeqb %xmm1, %xmm4
294 pcmpeqb %xmm2, %xmm3
295 movdqa %xmm5, %xmm6
296 pmovmskb %xmm8, %ecx
297 pminub %xmm4, %xmm3
298 movdqu 31(%rax), %xmm4
299 por %xmm3, %xmm7
300 movdqa %xmm5, %xmm3
301 pcmpeqb %xmm0, %xmm6
302 movdqa 48(%rax), %xmm5
303 pcmpeqb %xmm1, %xmm4
304 pmovmskb %xmm7, %r8d
305 pcmpeqb %xmm2, %xmm3
306 pcmpeqb %xmm5, %xmm0
307 pminub %xmm4, %xmm3
308 movdqu 47(%rax), %xmm4
309 por %xmm3, %xmm6
310 movdqa %xmm5, %xmm3
311 salq $16, %r8
312 pcmpeqb %xmm1, %xmm4
313 pcmpeqb %xmm2, %xmm3
314 pmovmskb %xmm6, %r10d
315 pminub %xmm4, %xmm3
316 por %xmm3, %xmm0
317 salq $32, %r10
318 orq %r10, %r8
319 orq %rcx, %r8
320 movl %edi, %ecx
321 pmovmskb %xmm0, %edx
322 subl %eax, %ecx
323 salq $48, %rdx
324 orq %rdx, %r8
325 shrq %cl, %r8
326 je L(loop_header)
327L(next_pair_index4):
328 bsfq %r8, %rax
329 addq %rdi, %rax
330 cmpb $0, (%rax)
331 je L(zero)
332
333 cmpq %rax,%rdi
334 je L(next_pair4)
335
336 movzbl 2(%rsi), %edx
337 testb %dl, %dl
338 je L(found3)
339 cmpb 1(%rax), %dl
340 jne L(next_pair4)
341 xorl %edx, %edx
342 jmp L(pair_loop_start4)
343
344 .p2align 4
345L(pair_loop4):
346 addq $1, %rdx
347 cmpb 1(%rax,%rdx), %cl
348 jne L(next_pair4)
349L(pair_loop_start4):
350 movzbl 3(%rsi,%rdx), %ecx
351 testb %cl, %cl
352 jne L(pair_loop4)
353L(found3):
354 subq $1, %rax
355 ret
356
357 .p2align 4
358L(next_pair4):
359 leaq -1(%r8), %rax
360 andq %rax, %r8
361 jne L(next_pair_index4)
362 jmp L(loop_header)
363
364 .p2align 4
365L(found):
366 rep
367 ret
368
369 .p2align 4
370L(zero):
371 xorl %eax, %eax
372 ret
373
374
375END(__strstr_sse2_unaligned)
376

source code of glibc/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S