/* Copyright (C) 1996-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

/* Bytewise compare two null-terminated strings of length no longer than N.  */

#include <sysdep.h>

	.set noat		/* AT is used explicitly below (PROF prologue);
				   keep the assembler from using it implicitly.  */
	.set noreorder		/* Instruction scheduling is done by hand.  */

/* EV6 only predicts one branch per octaword.  We'll use these to push
   subsequent branches back to the next bundle.  This will generally add
   a fetch+decode cycle to older machines, so skip in that case.  */
#ifdef __alpha_fix__
# define ev6_unop	unop
#else
# define ev6_unop
#endif

	.text
35 | |
/* int strncmp (const char *s1, const char *s2, size_t n)

   Alpha calling convention:
     a0 = s1, a1 = s2, a2 = n; result returned in v0
     (negative / zero / positive, computed bytewise in $wordcmp).

   Strategy: compare 8 bytes at a time with ldq_u loads.  cmpbge against
   zero yields a per-byte mask whose set bits mark null bytes ("eos").
   The count is converted up front into a2 = remaining full s1 words and
   t10 = offset of the last counted byte ("eoc").  Co-aligned strings take
   the $aligned/$a_loop path; otherwise s2 words are assembled from pairs
   of partial words with extql/extqh in the $unaligned/$u_loop path,
   taking care never to read past the end of s2.  */

ENTRY(strncmp)
#ifdef PROF
	ldgp	gp, 0(pv)
	lda	AT, _mcount
	jsr	AT, (AT), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	xor	a0, a1, t2	# are s1 and s2 co-aligned?
	beq	a2, $zerolength
	ldq_u	t0, 0(a0)	# load asap to give cache time to catch up
	ldq_u	t1, 0(a1)
	lda	t3, -1		# all-ones mask, source for mskqh/mskql below
	and	t2, 7, t2
	srl	t3, 1, t6	# t6 = LONG_MAX
	and	a0, 7, t4	# find s1 misalignment
	and	a1, 7, t5	# find s2 misalignment
	cmovlt	a2, t6, a2	# bound neg count to LONG_MAX
	addq	a1, a2, a3	# s2+count
	addq	a2, t4, a2	# bias count by s1 misalignment
	and	a2, 7, t10	# ofs of last byte in s1 last word
	srl	a2, 3, a2	# remaining full words in s1 count
	bne	t2, $unaligned

	/* On entry to this basic block:
	   t0 == the first word of s1.
	   t1 == the first word of s2.
	   t3 == -1.  */
$aligned:
	mskqh	t3, a1, t8	# mask off leading garbage
	ornot	t1, t8, t1	# force pre-string bytes non-zero in both words
	ornot	t0, t8, t0
	cmpbge	zero, t1, t7	# bits set iff null found
	beq	a2, $eoc	# check end of count
	bne	t7, $eos
	beq	t10, $ant_loop	# no trailing bytes -> careful-read loop

	/* Aligned compare main loop.
	   On entry to this basic block:
	   t0 == an s1 word.
	   t1 == an s2 word not containing a null.  */

	.align 4
$a_loop:
	xor	t0, t1, t2	# e0 :
	bne	t2, $wordcmp	# .. e1 (zdb)
	ldq_u	t1, 8(a1)	# e0 :
	ldq_u	t0, 8(a0)	# .. e1 :

	subq	a2, 1, a2	# e0 :
	addq	a1, 8, a1	# .. e1 :
	addq	a0, 8, a0	# e0 :
	beq	a2, $eoc	# .. e1 :

	cmpbge	zero, t1, t7	# e0 :
	beq	t7, $a_loop	# .. e1 :

	br	$eos

	/* Alternate aligned compare loop, for when there's no trailing
	   bytes on the count.  We have to avoid reading too much data.  */
	.align 4
$ant_loop:
	xor	t0, t1, t2	# e0 :
	ev6_unop
	ev6_unop
	bne	t2, $wordcmp	# .. e1 (zdb)

	subq	a2, 1, a2	# e0 :
	beq	a2, $zerolength	# .. e1 : count exhausted, strings equal so far
	ldq_u	t1, 8(a1)	# e0 :
	ldq_u	t0, 8(a0)	# .. e1 :

	addq	a1, 8, a1	# e0 :
	addq	a0, 8, a0	# .. e1 :
	cmpbge	zero, t1, t7	# e0 :
	beq	t7, $ant_loop	# .. e1 :

	br	$eos

/* The two strings are not co-aligned.  Align s1 and cope.  */
/* On entry to this basic block:
   t0 == the first word of s1.
   t1 == the first word of s2.
   t3 == -1.
   t4 == misalignment of s1.
   t5 == misalignment of s2.
   t10 == misalignment of s1 end.  */
	.align 4
$unaligned:
	/* If s1 misalignment is larger than s2 misalignment, we need
	   extra startup checks to avoid SEGV.  */
	subq	a1, t4, a1	# adjust s2 for s1 misalignment
	cmpult	t4, t5, t9
	subq	a3, 1, a3	# last byte of s2
	bic	a1, 7, t8
	mskqh	t3, t5, t7	# mask garbage in s2
	subq	a3, t8, a3
	ornot	t1, t7, t7
	srl	a3, 3, a3	# remaining full words in s2 count
	beq	t9, $u_head

	/* Failing that, we need to look for both eos and eoc within the
	   first word of s2.  If we find either, we can continue by
	   pretending that the next word of s2 is all zeros.  */
	lda	t2, 0		# next = zero
	cmpeq	a3, 0, t8	# eoc in the first word of s2?
	cmpbge	zero, t7, t7	# eos in the first word of s2?
	or	t7, t8, t8
	bne	t8, $u_head_nl	# skip the 8(a1) load -- it could fault

	/* We know just enough now to be able to assemble the first
	   full word of s2.  We can still find a zero at the end of it.

	   On entry to this basic block:
	   t0 == first word of s1
	   t1 == first partial word of s2.
	   t3 == -1.
	   t10 == ofs of last byte in s1 last word.
	   t11 == ofs of last byte in s2 last word.  */
$u_head:
	ldq_u	t2, 8(a1)	# load second partial s2 word
	subq	a3, 1, a3
$u_head_nl:
	extql	t1, a1, t1	# create first s2 word
	mskqh	t3, a0, t8
	extqh	t2, a1, t4
	ornot	t0, t8, t0	# kill s1 garbage
	or	t1, t4, t1	# s2 word now complete
	cmpbge	zero, t0, t7	# find eos in first s1 word
	ornot	t1, t8, t1	# kill s2 garbage
	beq	a2, $eoc
	subq	a2, 1, a2
	bne	t7, $eos
	mskql	t3, a1, t8	# mask out s2[1] bits we have seen
	xor	t0, t1, t4	# compare aligned words
	or	t2, t8, t8
	bne	t4, $wordcmp
	cmpbge	zero, t8, t7	# eos in high bits of s2[1]?
	cmpeq	a3, 0, t8	# eoc in s2[1]?
	or	t7, t8, t7
	bne	t7, $u_final

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned words from s2.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t2 == the unshifted low-bits from the next s2 word.
	   t10 == ofs of last byte in s1 last word.
	   t11 == ofs of last byte in s2 last word.  */
	.align 4
$u_loop:
	extql	t2, a1, t3	# e0 :
	ldq_u	t2, 16(a1)	# .. e1 : load next s2 high bits
	ldq_u	t0, 8(a0)	# e0 : load next s1 word
	addq	a1, 8, a1	# .. e1 :

	addq	a0, 8, a0	# e0 :
	subq	a3, 1, a3	# .. e1 :
	extqh	t2, a1, t1	# e0 :
	cmpbge	zero, t0, t7	# .. e1 : eos in current s1 word

	or	t1, t3, t1	# e0 :
	beq	a2, $eoc	# .. e1 : eoc in current s1 word
	subq	a2, 1, a2	# e0 :
	cmpbge	zero, t2, t4	# .. e1 : eos in s2[1]

	xor	t0, t1, t3	# e0 : compare the words
	ev6_unop
	ev6_unop
	bne	t7, $eos	# .. e1 :

	cmpeq	a3, 0, t5	# e0 : eoc in s2[1]
	ev6_unop
	ev6_unop
	bne	t3, $wordcmp	# .. e1 :

	or	t4, t5, t4	# e0 : eos or eoc in s2[1].
	beq	t4, $u_loop	# .. e1 (zdb)

	/* We've found a zero in the low bits of the last s2 word.  Get
	   the next s1 word and align them.  */
	.align 3
$u_final:
	ldq_u	t0, 8(a0)
	extql	t2, a1, t1
	cmpbge	zero, t1, t7
	bne	a2, $eos	# count not exhausted: fall into $eoc otherwise

	/* We've hit end of count.  Zero everything after the count
	   and compare whats left.  */
	.align 3
$eoc:
	mskql	t0, t10, t0	# zero bytes at and beyond ofs t10
	mskql	t1, t10, t1
	cmpbge	zero, t1, t7	# the mskql guarantees t7 != 0, so $eos stops here

	/* We've found a zero somewhere in a word we just read.
	   On entry to this basic block:
	   t0 == s1 word
	   t1 == s2 word
	   t7 == cmpbge mask containing the zero.  */
	.align 3
$eos:
	negq	t7, t6		# create bytemask of valid data:
	and	t6, t7, t8	# ... t8 = lowest set bit of t7 (first null byte)
	subq	t8, 1, t6	# ... t6 = bytes strictly before the null
	or	t6, t8, t7	# ... t7 = bytes up to and including the null
	zapnot	t0, t7, t0	# kill the garbage
	zapnot	t1, t7, t1
	xor	t0, t1, v0	# ... and compare
	beq	v0, $done

	/* Here we have two differing co-aligned words in t0 & t1.
	   Bytewise compare them and return (t0 > t1 ? 1 : -1).  */
	.align 3
$wordcmp:
	cmpbge	t0, t1, t2	# comparison yields bit mask of ge
	cmpbge	t1, t0, t3
	xor	t2, t3, t0	# bits set iff t0/t1 bytes differ
	negq	t0, t1		# clear all but least bit
	and	t0, t1, t0
	lda	v0, -1		# assume s1 < s2 ...
	and	t0, t2, t1	# was bit set in t0 > t1?
	cmovne	t1, 1, v0	# ... flip to 1 if the first differing byte was ge
$done:
	ret

	.align 3
$zerolength:
	clr	v0		# n == 0 (or count exhausted): strings compare equal
	ret

END(strncmp)
libc_hidden_builtin_def (strncmp)
277 | |