/* strchr optimized with SSE2.
   Copyright (C) 2009-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so this file must also be built for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef STRCHR
#  define STRCHR __strchr_sse2
# endif

# include <sysdep.h>

	.text
ENTRY (STRCHR)
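	/* Broadcast the target byte c (from %esi) to all 16 bytes of
	   %xmm1, and check whether reading 64 bytes starting at %rdi
	   would cross a page boundary (page offset > 4096 - 64).  */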
	movd	%esi, %xmm1
	movl	%edi, %eax
	andl	$4095, %eax
	punpcklbw %xmm1, %xmm1
	cmpl	$4032, %eax
	punpcklwd %xmm1, %xmm1
	pshufd	$0, %xmm1, %xmm1
	jg	L(cross_page)
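	/* The 64-byte read is page-safe.  Check the first 16 bytes for
	   either c (%xmm1) or the terminating NUL (%xmm3).  */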
	movdqu	(%rdi), %xmm0
	pxor	%xmm3, %xmm3
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	je	L(next_48_bytes)
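	/* Found c or NUL among the first 16 bytes; bsf gives the offset
	   of the first match.  Plain strchr (unlike strchrnul) must
	   return NULL when the byte found is the NUL terminator rather
	   than c.  */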
	bsf	%eax, %eax
# ifdef AS_STRCHRNUL
	leaq	(%rdi,%rax), %rax
# else
	movl	$0, %edx
	leaq	(%rdi,%rax), %rax
	cmpb	%sil, (%rax)
	cmovne	%rdx, %rax
# endif
	ret

	.p2align 3
L(next_48_bytes):
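	/* No match in the first 16 bytes.  Check bytes 16..63 and build
	   a 64-bit mask in %rax with bit i set when byte %rdi + i is c
	   or NUL (bits 0..15 remain clear).  */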
	movdqu	16(%rdi), %xmm0
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %ecx
	movdqu	32(%rdi), %xmm0
	movdqa	%xmm0, %xmm4
	pcmpeqb	%xmm1, %xmm0
	salq	$16, %rcx
	pcmpeqb	%xmm3, %xmm4
	por	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm0, %xmm3
	salq	$32, %rax
	pcmpeqb	%xmm1, %xmm0
	orq	%rcx, %rax
	por	%xmm3, %xmm0
	pmovmskb %xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rax
	testq	%rax, %rax
	jne	L(return)
L(loop_start):
	/* We use this alignment to force the loop to be aligned to 8 but
	   not 16 bytes.  This gives better scheduling on AMD
	   processors.  */
	.p2align 4
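	/* Zero %xmm6 (used to detect NUL bytes) and align %rdi down to
	   64 bytes so the loop can use aligned loads.  The loop advances
	   by 64 before each load, and every string byte before that
	   first iteration has already been checked.  */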
	pxor	%xmm6, %xmm6
	andq	$-64, %rdi
	.p2align 3
L(loop64):
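	/* For each 16-byte block x, min(x ^ c, x) (unsigned byte
	   minimum) is zero in a byte position iff that byte equals c or
	   is NUL.  Folding the four blocks together with pminub lets a
	   single pcmpeqb/pmovmskb test all 64 bytes per iteration.  */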
	addq	$64, %rdi
	movdqa	(%rdi), %xmm5
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	pxor	%xmm1, %xmm5
	movdqa	48(%rdi), %xmm4
	pxor	%xmm1, %xmm2
	pxor	%xmm1, %xmm3
	pminub	(%rdi), %xmm5
	pxor	%xmm1, %xmm4
	pminub	16(%rdi), %xmm2
	pminub	32(%rdi), %xmm3
	pminub	%xmm2, %xmm5
	pminub	48(%rdi), %xmm4
	pminub	%xmm3, %xmm5
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm6, %xmm5
	pmovmskb %xmm5, %eax

	testl	%eax, %eax
	je	L(loop64)

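	/* At least one of these 64 bytes is c or NUL.  %xmm2, %xmm3 and
	   %xmm4 still hold min(x ^ c, x) for blocks 1-3, so comparing
	   them with zero yields their match masks directly; block 0 is
	   reloaded and retested because %xmm5 was folded across all four
	   blocks.  The masks are merged into a 64-bit mask in %rax.  */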
	movdqa	(%rdi), %xmm5
	movdqa	%xmm5, %xmm0
	pcmpeqb	%xmm1, %xmm5
	pcmpeqb	%xmm6, %xmm0
	por	%xmm0, %xmm5
	pcmpeqb	%xmm6, %xmm2
	pcmpeqb	%xmm6, %xmm3
	pcmpeqb	%xmm6, %xmm4

	pmovmskb %xmm5, %ecx
	pmovmskb %xmm2, %eax
	salq	$16, %rax
	pmovmskb %xmm3, %r8d
	pmovmskb %xmm4, %edx
	salq	$32, %r8
	orq	%r8, %rax
	orq	%rcx, %rax
	salq	$48, %rdx
	orq	%rdx, %rax
	.p2align 3
L(return):
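	/* %rax holds a 64-bit match mask relative to %rdi; bsf yields
	   the offset of the first byte equal to c or NUL.  As above,
	   plain strchr returns NULL when that byte is the terminator
	   rather than c.  */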
	bsfq	%rax, %rax
# ifdef AS_STRCHRNUL
	leaq	(%rdi,%rax), %rax
# else
	movl	$0, %edx
	leaq	(%rdi,%rax), %rax
	cmpb	%sil, (%rax)
	cmovne	%rdx, %rax
# endif
	ret
	.p2align 4

L(cross_page):
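	/* Reading 64 bytes from %rdi would cross a page boundary.
	   Align down to the containing 64-byte block, scan the whole
	   block with aligned loads, then shift the resulting mask so it
	   becomes relative to the original %rdi.  */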
	movq	%rdi, %rdx
	pxor	%xmm2, %xmm2
	andq	$-64, %rdx
	movdqa	%xmm1, %xmm0
	movdqa	(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %r8d
	movdqa	16(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %eax
	movdqa	32(%rdx), %xmm3
	movdqa	%xmm3, %xmm4
	pcmpeqb	%xmm1, %xmm3
	salq	$16, %rax
	pcmpeqb	%xmm2, %xmm4
	por	%xmm4, %xmm3
	pmovmskb %xmm3, %r9d
	movdqa	48(%rdx), %xmm3
	pcmpeqb	%xmm3, %xmm2
	salq	$32, %r9
	pcmpeqb	%xmm3, %xmm0
	orq	%r9, %rax
	orq	%r8, %rax
	por	%xmm2, %xmm0
	pmovmskb %xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rax
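	/* %cl = %rdi - %rdx is the offset of the string start within the
	   64-byte block; shifting the mask right by %cl discards matches
	   from bytes before %rdi and makes the mask relative to %rdi, as
	   L(return) expects.  */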
	movl	%edi, %ecx
	subb	%dl, %cl
	shrq	%cl, %rax
	testq	%rax, %rax
	jne	L(return)
	jmp	L(loop_start)

END (STRCHR)
#endif

Source code of glibc/sysdeps/x86_64/multiarch/strchr-sse2.S