/*
 * strrchr - find the last occurrence of a character in a string.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results. */
#define srcin x0
#define chrin w1

#define result x0

#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
#define src_match x6
#define src_offset x7
#define const_m1 x8
#define tmp4 x9
#define nul_match x10
#define chr_match x11

#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
#define vrepmask_0 v7
#define vrepmask_c v16
#define vend1 v17
#define vend2 v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (the tuple for the lowest-addressed byte is
   always in bits 0 and 1, for both big- and little-endian systems).
   For each tuple, bit 0 is set iff the relevant byte matched the
   requested character; bit 1 is set iff the relevant byte matched
   the NUL end of string (when searching for NUL itself we still key
   off bit 0, since the character compare also matches the NUL byte).
   Since the bits in the syndrome reflect exactly the order in which
   things occur in the original string, a bit scan (count trailing
   zeros for the first event, count leading zeros for the last) will
   identify exactly which byte caused the termination, and why.  */
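
/* For illustration only: a rough scalar C sketch of the same syndrome
   idea.  The real code below builds the syndrome with Neon compares,
   mask ANDs and pairwise additions rather than a byte loop, and the
   helper name build_syndrome is invented for this sketch.

       #include <stdint.h>

       static uint64_t build_syndrome (const unsigned char *hunk,
                                       unsigned char c)
       {
           uint64_t syn = 0;
           for (int i = 0; i < 32; i++)
           {
               if (hunk[i] == c)
                   syn |= 1ULL << (2 * i);        // bit 0 of tuple: char match
               if (hunk[i] == 0)
                   syn |= 1ULL << (2 * i + 1);    // bit 1 of tuple: NUL
           }
           return syn;  // e.g. __builtin_ctzll (syn) / 2 is the first event byte
       }
*/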

ENTRY (__strrchr_aarch64)
        /* Magic constant 0x40100401 to allow us to identify which lane
           matches the requested byte.  Magic constant 0x80200802 used
           similarly for NUL termination.  */
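        /* Each 32-bit lane of the character mask has one bit per byte, at
           bit positions 0, 10, 20 and 30; the NUL mask is the same value
           shifted left by one.  After the pairwise-add reductions below,
           every byte of a 32-byte hunk therefore lands in its own two-bit
           tuple of the 64-bit syndrome, in string order.  */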
        mov wtmp2, #0x0401
        movk wtmp2, #0x4010, lsl #16
        dup vrepchr.16b, chrin
        bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
        dup vrepmask_c.4s, wtmp2
        mov src_offset, #0
        ands tmp1, srcin, #31
        add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
        b.eq L(aligned)

        /* Input string is not 32-byte aligned.  Rather than forcing
           the padding bytes to a safe value, we calculate the syndrome
           for all the bytes, but then mask off those bits of the
           syndrome that are related to the padding.  */
        ld1 {vdata1.16b, vdata2.16b}, [src], #32
        neg tmp1, tmp1
        cmeq vhas_nul1.16b, vdata1.16b, #0
        cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq vhas_nul2.16b, vdata2.16b, #0
        cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
        and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
        and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
        and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
        and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
        addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
        addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
        addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
        addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
        mov nul_match, vhas_nul1.d[0]
        lsl tmp1, tmp1, #1
        mov const_m1, #~0
        mov chr_match, vhas_chr1.d[0]
        lsr tmp3, const_m1, tmp1
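        /* The shift amount is 64 - 2*(srcin & 31) (taken modulo 64), so tmp3
           is a mask of the low syndrome bits that belong to the padding
           bytes before the real start of the string.  */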

        bic nul_match, nul_match, tmp3 // Mask padding bits.
        bic chr_match, chr_match, tmp3 // Mask padding bits.
        cbnz nul_match, L(tail)

L(loop):
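        /* If the block just processed contained a character match, remember
           the address just past that block and its character syndrome;
           otherwise keep the values from the last block that did match.  */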
        cmp chr_match, #0
        csel src_match, src, src_match, ne
        csel src_offset, chr_match, src_offset, ne
L(aligned):
        ld1 {vdata1.16b, vdata2.16b}, [src], #32
        cmeq vhas_nul1.16b, vdata1.16b, #0
        cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
        cmeq vhas_nul2.16b, vdata2.16b, #0
        cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
        addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
        and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
        and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
        addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
        addp vend1.16b, vend1.16b, vend1.16b // 128->64
        addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
        mov nul_match, vend1.d[0]
        mov chr_match, vhas_chr1.d[0]
        cbz nul_match, L(loop)

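        /* Inside the loop the NUL compare results are only reduced far
           enough to tell whether the block contains a NUL at all; now that
           the terminating block has been found, rebuild the positional NUL
           syndrome from the saved compare results.  */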
        and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
        and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
        addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
        addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
        mov nul_match, vhas_nul1.d[0]

L(tail):
        /* Work out exactly where the string ends.  */
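        /* (nul_match - 1) ^ nul_match is a mask covering the first NUL bit
           and every bit below it, so the AND discards character matches that
           occur after the terminating NUL; a match on the NUL byte itself is
           kept, which handles searching for '\0'.  */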
        sub tmp4, nul_match, #1
        eor tmp4, tmp4, nul_match
        ands chr_match, chr_match, tmp4
        /* And pick the values corresponding to the last match.  */
        csel src_match, src, src_match, ne
        csel src_offset, chr_match, src_offset, ne

        /* Count down from the top of the syndrome to find the last match.  */
        clz tmp3, src_offset
        /* src_match points just past the 32-byte block containing the
           match, so we can simply subtract half the bit-offset into the
           syndrome.  Because we are counting down, we need to go back one
           more character.  */
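        /* For example, a match in the last byte of the block has its bit at
           position 62, so clz gives 1 and (1 + 2) >> 1 = 1: the result is
           one byte before src_match.  */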
        add tmp3, tmp3, #2
        sub result, src_match, tmp3, lsr #1
        /* But if the syndrome shows no match was found, then return NULL.  */
        cmp src_offset, #0
        csel result, result, xzr, ne

        ret

END (__strrchr_aarch64)