/* Implementation for strrchr using evex256 and evex512.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSRCHR
#  if VEC_SIZE == 64
#   define RCX_M cx
#   define KORTEST_M kortestw
#  else
#   define RCX_M cl
#   define KORTEST_M kortestb
#  endif

#  define SHIFT_REG VRCX
#  define CHAR_SIZE 4
#  define VPCMP vpcmpd
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd

# else
#  if VEC_SIZE == 64
#   define SHIFT_REG VRCX
#  else
#   define SHIFT_REG VRDI
#  endif
#  define CHAR_SIZE 1
#  define VPCMP vpcmpb
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb

#  define RCX_M VRCX
#  define KORTEST_M KORTEST
# endif

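/* SHIFT_R shifts a mask right by the offset of src within the loaded
   VEC (in CHARs) on the page-cross path.  shrx takes its count from
   SHIFT_REG and does not modify flags, hence the explicit test
   afterwards; the evex512 byte version uses `shr %cl`, which sets
   the zero flag itself.  */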
# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
#  define SHIFT_R(cnt, val) shrx cnt, val, val
# else
#  define SHIFT_R(cnt, val) shr %cl, val
# endif

# define VMATCH VMM(0)
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096
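
/* Overview: check the first, possibly unaligned, VEC for a zero CHAR
   and return the last match before it if one is found.  Otherwise
   align src and check the next three VECs individually, then run a
   2 * VEC_SIZE loop that exits on any zero CHAR or match.  A match
   with no zero CHAR switches to a second 2 * VEC_SIZE loop that
   remembers the furthest match seen so far; once the zero CHAR is
   reached, the last match before it (or the remembered match) is
   returned.  */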

	.section SECTION(.text), "ax", @progbits
/* Aligning the entry point to 64 bytes provides better performance
   for strings of one vector length.  */
ENTRY_P2ALIGN(STRRCHR, 6)
	movl %edi, %eax
	/* Broadcast CHAR to VMATCH.  */
	VPBROADCAST %esi, %VMATCH

	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	jg L(cross_page_boundary)
L(page_cross_continue):
	VMOVU (%rdi), %VMM(1)
	/* k0 has a 1 for each zero CHAR in VMM(1).  */
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VGPR(rsi)
	test %VGPR(rsi), %VGPR(rsi)
	jz L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */

	/* K1 has a 1 for each search CHAR match in VEC(1).  */
	VPCMPEQ %VMATCH, %VMM(1), %k1
	KMOV %k1, %VGPR(rax)
	/* Build mask up until first zero CHAR (used to mask off
	   potential search CHAR matches past the end of the string).  */
	blsmsk %VGPR(rsi), %VGPR(rsi)
	/* Use `and` here to remove any out of bounds matches so we can
	   do a reverse scan on `rax` to find the last match.  */
	and %VGPR(rsi), %VGPR(rax)
	jz L(ret0)
	/* Get last match.  */
	bsr %VGPR(rax), %VGPR(rax)
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret0):
	ret

	/* The return paths for the first VECs x1/x2/x3 hard-code the
	   backward search for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMPEQ %VMATCH, %VMM(2), %k1
	KMOV %k1, %VGPR(rax)
	blsmsk %VGPR(rcx), %VGPR(rcx)
	/* eax non-zero if search CHAR in range.  */
	and %VGPR(rcx), %VGPR(rax)
	jnz L(first_vec_x1_return)

	/* fallthrough: no match in VMM(2), so check for earlier matches
	   (in VMM(1)).  */
	.p2align 4,, 4
L(first_vec_x0_test):
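	/* rsi still holds the original src pointer (saved in
	   L(aligned_more)), which is the base address of VMM(1).  */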
	VPCMPEQ %VMATCH, %VMM(1), %k1
	KMOV %k1, %VGPR(rax)
	test %VGPR(rax), %VGPR(rax)
	jz L(ret1)
	bsr %VGPR(rax), %VGPR(rax)
# ifdef USE_AS_WCSRCHR
	leaq (%rsi, %rax, CHAR_SIZE), %rax
# else
	addq %rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x3):
	VPCMPEQ %VMATCH, %VMM(4), %k1
	KMOV %k1, %VGPR(rax)
	blsmsk %VGPR(rcx), %VGPR(rcx)
	/* If no search CHAR match in range, check VMM(3)/VMM(2)/VMM(1).  */
	and %VGPR(rcx), %VGPR(rax)
	jz L(first_vec_x1_or_x2)
	bsr %VGPR(rax), %VGPR(rax)
	leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
	.p2align 4,, 4

L(first_vec_x2):
	VPCMPEQ %VMATCH, %VMM(3), %k1
	KMOV %k1, %VGPR(rax)
	blsmsk %VGPR(rcx), %VGPR(rcx)
	/* Check VMM(3) for the last match first.  If there is no match,
	   try VMM(2)/VMM(1).  */
	and %VGPR(rcx), %VGPR(rax)
	jz L(first_vec_x0_x1_test)
	bsr %VGPR(rax), %VGPR(rax)
	leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMPEQ %VMATCH, %VMM(2), %k1
	KMOV %k1, %VGPR(rax)
	/* Check VMM(2) for the last match first.  If there is no match,
	   try VMM(1).  */
	test %VGPR(rax), %VGPR(rax)
	jz L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsr %VGPR(rax), %VGPR(rax)
	leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 12
L(aligned_more):
	/* Need to keep the original pointer in case VEC(1) has the last
	   match.  */
	movq %rdi, %rsi
	andq $-VEC_SIZE, %rdi

	VMOVU VEC_SIZE(%rdi), %VMM(2)
	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
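	/* Save the VEC_SIZE-aligned pointer in r8.  rdi is realigned to
	   2 * VEC_SIZE and advanced by the loops below, while the
	   backward-search return paths still need this aligned base.  */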
	movq %rdi, %r8
	test %VRCX, %VRCX
	jnz L(first_vec_x1)

	VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3)
	VPTESTN %VMM(3), %VMM(3), %k0
	KMOV %k0, %VRCX

	test %VRCX, %VRCX
	jnz L(first_vec_x2)

	VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX

	/* Intentionally use 64-bit here.  EVEX256 version needs 1-byte
	   padding for efficient nop before loop alignment.  */
	test %rcx, %rcx
	jnz L(first_vec_x3)

	andq $-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
	   guarantee they don't contain a match.  */
	VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)

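	/* k2 has a 0 for each CHAR in VMM(5) equal to the search CHAR,
	   and k3 (zero-masked by k2) has a 0 wherever either vector
	   matches.  VPMIN gives a zero CHAR wherever either vector has
	   one, so k1 (zero-masked by k3) is all ones iff there is
	   neither a zero CHAR nor a match; KORTEST sets the carry flag
	   in that case and `jc` keeps looping.  */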
	VPCMP $4, %VMM(5), %VMATCH, %k2
	VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}

	VPMIN %VMM(5), %VMM(6), %VMM(7)

	VPTEST %VMM(7), %VMM(7), %k1{%k3}
	subq $(VEC_SIZE * -2), %rdi
	KORTEST_M %k1, %k1
	jc L(first_aligned_loop)

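	/* The loop exited, so VMM(5)/VMM(6) contain a zero CHAR, a
	   match, or both.  VRDX is non-zero iff a zero CHAR was found.
	   If only a match was found, enter the second loop with it as
	   the furthest match so far.  Otherwise return the last match
	   before the zero CHAR, falling back to the earlier VECs.  */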
	VPTESTN %VMM(7), %VMM(7), %k1
	KMOV %k1, %VRDX
	test %VRDX, %VRDX
	jz L(second_aligned_loop_prep)

	KORTEST_M %k3, %k3
	jnc L(return_first_aligned_loop)

	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
	VPCMPEQ %VMM(4), %VMATCH, %k4
	KMOV %k4, %VRAX
	test %VRAX, %VRAX
	jz L(first_vec_x1_or_x2)
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 8
L(return_first_aligned_loop):
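	/* Both a zero CHAR and a match are in VMM(5)/VMM(6); return the
	   last match before the zero CHAR.  blsmsk sets the carry flag
	   iff its source was zero, so `jnc` means the zero CHAR is in
	   VMM(5) and only matches there can be in range.  */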
	VPTESTN %VMM(5), %VMM(5), %k0
	KMOV %k0, %VRCX
	blsmsk %VRCX, %VRCX
	jnc L(return_first_new_match_first)
	blsmsk %VRDX, %VRDX
	VPCMPEQ %VMM(6), %VMATCH, %k0
	KMOV %k0, %VRAX
	addq $VEC_SIZE, %rdi
	and %VRDX, %VRAX
	jnz L(return_first_new_match_ret)
	subq $VEC_SIZE, %rdi
L(return_first_new_match_first):
	KMOV %k2, %VRAX
# ifdef USE_AS_WCSRCHR
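	/* Can't use `andn` here: VRCX may be all ones, which would
	   leave bits set above CHAR_PER_VEC, so complement only the low
	   CHAR_PER_VEC bits.  */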
	xorl $((1 << CHAR_PER_VEC) - 1), %VRAX
	and %VRCX, %VRAX
# else
	andn %VRCX, %VRAX, %VRAX
# endif
	jz L(first_vec_x1_or_x2_or_x3)
L(return_first_new_match_ret):
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMPEQ %VMM(3), %VMATCH, %k3
	KMOV %k3, %VRAX
	test %VRAX, %VRAX
	jz L(first_vec_x0_x1_test)
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is the 'fast' path per se.  */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
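	/* Save the current position and data vectors so that, if the
	   string ends with no newer match, L(return_old_match) can
	   rescan VMM(7)/VMM(8) for the last match.  */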
	movq %rdi, %rsi
	VMOVA %VMM(5), %VMM(7)
	VMOVA %VMM(6), %VMM(8)
	.p2align 4
L(second_aligned_loop):
	VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5)
	VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6)
	VPCMP $4, %VMM(5), %VMATCH, %k2
	VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}

	VPMIN %VMM(5), %VMM(6), %VMM(4)

	VPTEST %VMM(4), %VMM(4), %k1{%k3}
	subq $(VEC_SIZE * -2), %rdi
	KMOV %k1, %VRCX
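	/* RCX_M is exactly CHAR_PER_VEC bits wide, so the increment
	   wraps to zero iff k1 was all ones, i.e. neither a zero CHAR
	   nor a match was found; keep looping in that case.  */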
	inc %RCX_M
	jz L(second_aligned_loop)
	VPTESTN %VMM(4), %VMM(4), %k1
	KMOV %k1, %VRDX
	test %VRDX, %VRDX
	jz L(second_aligned_loop_set_furthest_match)

	KORTEST_M %k3, %k3
	jnc L(return_new_match)
	/* Branch here because there is a significant advantage in terms
	   of the output-dependency chain in using edx.  */

L(return_old_match):
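	/* The saved VMM(7)/VMM(8) are guaranteed to contain a match.
	   bsr sets ZF iff its source is zero, so `jnz` returns the
	   match from VMM(8) if it has one, else the one in VMM(7).  */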
	VPCMPEQ %VMM(8), %VMATCH, %k0
	KMOV %k0, %VRCX
	bsr %VRCX, %VRCX
	jnz L(return_old_match_ret)

	VPCMPEQ %VMM(7), %VMATCH, %k0
	KMOV %k0, %VRCX
	bsr %VRCX, %VRCX
	subq $VEC_SIZE, %rsi
L(return_old_match_ret):
	leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
	ret

L(return_new_match):
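	/* A zero CHAR was found in VMM(5)/VMM(6); return the last match
	   before it, or fall back to the saved furthest match.  As in
	   L(return_first_aligned_loop), `jnc` means the zero CHAR is in
	   VMM(5).  */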
	VPTESTN %VMM(5), %VMM(5), %k0
	KMOV %k0, %VRCX
	blsmsk %VRCX, %VRCX
	jnc L(return_new_match_first)
	dec %VRDX
	VPCMPEQ %VMM(6), %VMATCH, %k0
	KMOV %k0, %VRAX
	addq $VEC_SIZE, %rdi
	and %VRDX, %VRAX
	jnz L(return_new_match_ret)
	subq $VEC_SIZE, %rdi
L(return_new_match_first):
	KMOV %k2, %VRAX
# ifdef USE_AS_WCSRCHR
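	/* Can't use `andn` here; see L(return_first_new_match_first).  */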
	xorl $((1 << CHAR_PER_VEC) - 1), %VRAX
	and %VRCX, %VRAX
# else
	andn %VRCX, %VRAX, %VRAX
# endif
	jz L(return_old_match)
L(return_new_match_ret):
	bsr %VRAX, %VRAX
	leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor rdi,
	   rax` gives a pointer with all page offset bits cleared, so an
	   offset of (PAGE_SIZE - VEC_SIZE) gets the last aligned VEC
	   before the page cross (guaranteed to be safe to read).  Doing
	   this as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax`
	   saves a bit of code size.  */
	xorq %rdi, %rax
	VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRSI

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# if VEC_SIZE == 64 || (defined USE_AS_WCSRCHR)
	movl %edi, %ecx
# endif
# ifdef USE_AS_WCSRCHR
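	/* Convert the byte offset within the VEC to a CHAR index.  */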
	andl $(VEC_SIZE - 1), %ecx
	shrl $2, %ecx
# endif
	SHIFT_R (%SHIFT_REG, %VRSI)
# if VEC_SIZE == 32 || (defined USE_AS_WCSRCHR)
	/* shrx (SHIFT_R here) does not modify flags, so test explicitly.
	   For strrchr-evex512, SHIFT_R is `shr %cl`, which already sets
	   the zero flag.  */
	test %VRSI, %VRSI
# endif
	jz L(page_cross_continue)

	/* Found zero CHAR so need to test for search CHAR.  */
	VPCMPEQ %VMATCH, %VMM(1), %k1
	KMOV %k1, %VRAX
	/* Shift out search CHAR matches that are before the beginning of
	   src (rdi).  */
	SHIFT_R (%SHIFT_REG, %VRAX)
	/* Check if any search CHAR match in range.  */
	blsmsk %VRSI, %VRSI
	and %VRSI, %VRAX
	jz L(ret2)
	bsr %VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
	leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
	addq %rdi, %rax
# endif
L(ret2):
	ret
	/* 3 bytes from cache-line for evex.  */
	/* 0 bytes from cache-line for evex512.  */
END(STRRCHR)
#endif