1 | #include <assert.h> |
2 | |
3 | #include <libnu/defines.h> |
4 | #include <libnu/ducet.h> |
5 | #include <libnu/strcoll.h> |
6 | #include <libnu/strcoll_internal.h> |
7 | |
8 | #if (defined NU_WITH_Z_COLLATION) || (defined NU_WITH_N_COLLATION) |
9 | |
10 | int32_t _compound_weight(int32_t w, |
11 | const char **encoded, const char *limit, |
12 | nu_read_iterator_t read, nu_compound_read_t com, |
13 | const char **tail, |
14 | nu_codepoint_weight_t weight, void *context) { |
15 | |
16 | const char *tailp = *tail; |
17 | |
18 | const char *p = *encoded; |
19 | int32_t new_w = w; |
20 | int32_t consumed = 1; /* one codepoint was consumed at the top of the stack (_nu_strcoll) */ |
21 | |
22 | while (p < limit) { |
23 | uint32_t u = 0; |
24 | |
25 | const char *np = com(p, limit, read, &u, &tailp); |
26 | new_w = weight(u, &w, context); |
27 | |
28 | /* after this point, w might hold rollback value |
29 | * and new_w holds actual weight */ |
30 | |
31 | ++consumed; |
32 | |
33 | if (new_w >= 0) { |
34 | /* if w == 0 or w == 1, then *p or *np is already pointing |
35 | * to needed place, otherwise re-read encoded in the forward |
36 | * direction preserving correctness of tail pointer */ |
37 | if (w != 0 && w != 1) { |
38 | assert(consumed + w > 1); |
39 | |
40 | np = *encoded; |
41 | tailp = *tail; |
42 | |
43 | for (int32_t i = 0; i < consumed - w; ++i) { |
44 | np = com(np, limit, read, 0, &tailp); |
45 | } |
46 | |
47 | w = 0; |
48 | } |
49 | |
50 | *encoded = (w == 0 ? np : p); |
51 | *tail = tailp; |
52 | |
53 | break; |
54 | } |
55 | |
56 | p = np; |
57 | w = new_w; |
58 | } |
59 | |
60 | if (new_w < 0) { |
61 | new_w = weight(0, &w, context); |
62 | } |
63 | |
64 | assert(new_w >= 0); |
65 | |
66 | return new_w; |
67 | } |
68 | |
69 | inline |
70 | int _nu_strcoll(const char *lhs, const char *lhs_limit, |
71 | const char *rhs, const char *rhs_limit, |
72 | nu_read_iterator_t it1, nu_read_iterator_t it2, |
73 | nu_compound_read_t com1, nu_compound_read_t com2, |
74 | nu_codepoint_weight_t weight, void *context, |
75 | ssize_t *collated_left, ssize_t *collated_right) { |
76 | |
77 | int cmp = 0; |
78 | |
79 | const char *lp = lhs, *rp = rhs; |
80 | const char *ltailp = 0, *rtailp = 0; |
81 | |
82 | uint32_t u1 = 0, u2 = 0; |
83 | |
84 | while ((lp < lhs_limit && rp < rhs_limit) |
85 | || (ltailp != 0 && rp < rhs_limit) |
86 | || (rtailp != 0 && lp < lhs_limit)) { |
87 | |
88 | lp = com1(lp, lhs_limit, it1, &u1, <ailp); |
89 | rp = com2(rp, rhs_limit, it2, &u2, &rtailp); |
90 | |
91 | #ifdef NU_DISABLE_CONTRACTIONS |
92 | /* if contractions are disabled, then same codepoints |
93 | * will produce same weights and there is no need |
94 | * to weight each, i.e. weight(u1) == weight(u2) and |
95 | * collation may proceed to next codepoints */ |
96 | if (u1 != u2) { |
97 | #endif |
98 | int32_t w1 = weight(u1, 0, context); |
99 | int32_t w2 = weight(u2, 0, context); |
100 | |
101 | if (w1 < 0) { |
102 | w1 = _compound_weight(w: w1, encoded: &lp, limit: lhs_limit, |
103 | read: it1, com: com1, tail: <ailp, |
104 | weight, context); |
105 | } |
106 | |
107 | if (w2 < 0) { |
108 | w2 = _compound_weight(w: w2, encoded: &rp, limit: rhs_limit, |
109 | read: it2, com: com2, tail: &rtailp, |
110 | weight, context); |
111 | } |
112 | |
113 | assert(w1 >= 0); |
114 | assert(w2 >= 0); |
115 | |
116 | if (w1 < w2) { |
117 | cmp = -1; |
118 | break; |
119 | } |
120 | else if (w1 > w2) { |
121 | cmp = 1; |
122 | break; |
123 | } |
124 | |
125 | #ifdef NU_DISABLE_CONTRACTIONS |
126 | } |
127 | #endif |
128 | |
129 | if (u1 == 0 || u2 == 0) { |
130 | break; |
131 | } |
132 | } |
133 | |
134 | /* collated_left and collated_right should count |
135 | * number of successfully collated bytes, not taking |
136 | * into account limits. therefore if cmp != 0, |
137 | * number of collated bytes is decreased by (at least) 1 |
138 | * and cmp is limits-fixed afterwards */ |
139 | |
140 | if (collated_left != 0) { |
141 | *collated_left = (lp - lhs) - (cmp == 0 ? 0 : 1); |
142 | } |
143 | |
144 | if (collated_right != 0) { |
145 | *collated_right = (rp - rhs) - (cmp == 0 ? 0 : 1); |
146 | } |
147 | |
148 | if (cmp == 0) { |
149 | if (rp < rhs_limit && lp >= lhs_limit) { |
150 | cmp = -1; |
151 | } |
152 | else if (lp < lhs_limit && rp >= rhs_limit) { |
153 | cmp = 1; |
154 | } |
155 | } |
156 | |
157 | return cmp; |
158 | } |
159 | |
160 | inline |
161 | const char* _nu_strchr(const char *lhs, const char *lhs_limit, |
162 | uint32_t c, nu_read_iterator_t read, |
163 | nu_compound_read_t com, |
164 | nu_casemapping_t casemap, nu_read_iterator_t casemap_read) { |
165 | |
166 | const char *p = lhs; |
167 | const char *tail = 0; |
168 | uint32_t u = 0; |
169 | |
170 | const char *rhs = 0; |
171 | |
172 | if (casemap != 0) { |
173 | rhs = casemap(c); |
174 | if (rhs != 0) { |
175 | rhs = casemap_read(rhs, &c); /* read new lead codepoint */ |
176 | } |
177 | } |
178 | |
179 | while (p < lhs_limit) { |
180 | const char *np = com(p, lhs_limit, read, &u, &tail); |
181 | |
182 | if (u == 0) { |
183 | break; |
184 | } |
185 | |
186 | if (u == c) { |
187 | if (rhs == 0) { |
188 | return p; |
189 | } |
190 | |
191 | /* rhs != 0 */ |
192 | |
193 | const char *rp = rhs; |
194 | uint32_t u2 = 0; |
195 | |
196 | do { |
197 | rp = casemap_read(rp, &u2); |
198 | |
199 | if (u2 == 0) { |
200 | return p; /* succ exit point */ |
201 | } |
202 | |
203 | if (np >= lhs_limit) { |
204 | return 0; |
205 | } |
206 | |
207 | np = com(np, lhs_limit, read, &u, &tail); |
208 | |
209 | if (u == 0) { |
210 | return 0; |
211 | } |
212 | |
213 | if (u != u2) { |
214 | break; |
215 | } |
216 | } |
217 | while (u2 != 0); |
218 | } |
219 | |
220 | p = np; |
221 | } |
222 | |
223 | return 0; |
224 | } |
225 | |
226 | inline |
227 | const char* _nu_strrchr(const char *encoded, const char *limit, |
228 | uint32_t c, nu_read_iterator_t read, |
229 | nu_compound_read_t com, |
230 | nu_casemapping_t casemap, nu_read_iterator_t casemap_read) { |
231 | |
232 | /* there is probably not much sense in finding string end by decoding it |
233 | * and then reverse read string again to find last codepoint, therefore |
234 | * this is a sequence of _nu_strchr() in forward direction |
235 | * |
236 | * please let me know if i'm wrong */ |
237 | |
238 | const char *p = encoded; |
239 | const char *last = 0; |
240 | |
241 | while (p < limit) { |
242 | p = _nu_strchr(lhs: p, lhs_limit: limit, c, read, com, casemap, casemap_read); |
243 | |
244 | if (p == 0) { |
245 | return last; |
246 | } |
247 | |
248 | last = p; |
249 | p = read(p, 0); /* skip one codepoint and continue */ |
250 | } |
251 | |
252 | return last; |
253 | } |
254 | |
255 | inline |
256 | const char* _nu_strstr(const char *haystack, const char *haystack_limit, |
257 | const char *needle, const char *needle_limit, |
258 | nu_read_iterator_t it1, nu_read_iterator_t it2, |
259 | nu_compound_read_t com1, nu_compound_read_t com2, |
260 | nu_casemapping_t casemap, nu_read_iterator_t casemap_read, |
261 | nu_codepoint_weight_t weight, void *context) { |
262 | |
263 | uint32_t n0 = 0; |
264 | if (needle_limit != needle) { |
265 | it2(needle, &n0); |
266 | } |
267 | |
268 | if (needle_limit == needle || n0 == 0) { |
269 | return haystack; |
270 | } |
271 | |
272 | ssize_t needle_len = (needle_limit != NU_UNLIMITED |
273 | ? (needle_limit - needle) |
274 | : nu_strbytelen(encoded: needle, it: it2)); |
275 | |
276 | const char *h0 = haystack; |
277 | do { |
278 | h0 = _nu_strchr(lhs: h0, lhs_limit: haystack_limit, |
279 | c: n0, read: it1, |
280 | com: com1, |
281 | casemap, casemap_read); |
282 | |
283 | if (h0 == 0) { |
284 | break; |
285 | } |
286 | |
287 | ssize_t collated_left = 0, collated_right = 0; |
288 | _nu_strcoll(lhs: h0, lhs_limit: haystack_limit, rhs: needle, rhs_limit: needle_limit, |
289 | it1, it2, |
290 | com1, com2, |
291 | weight, context, |
292 | collated_left: &collated_left, collated_right: &collated_right); |
293 | |
294 | /* it doesn't matter what collate result is |
295 | * if whole needle was successfully collated */ |
296 | if (collated_right >= needle_len) { |
297 | return h0; |
298 | } |
299 | |
300 | /* skip one codepoint in haystack */ |
301 | if (h0 < haystack_limit) { |
302 | h0 = it1(h0, 0); |
303 | } |
304 | } |
305 | while (h0 != 0 && h0 < haystack_limit); |
306 | |
307 | return 0; |
308 | } |
309 | |
310 | #ifdef NU_WITH_Z_COLLATION |
311 | |
312 | const char* nu_strchr(const char *encoded, uint32_t c, nu_read_iterator_t read) { |
313 | return _nu_strchr(lhs: encoded, NU_UNLIMITED, |
314 | c, read, |
315 | com: nu_default_compound_read, |
316 | casemap: 0, casemap_read: 0); |
317 | } |
318 | |
319 | const char* nu_strcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read) { |
320 | return _nu_strchr(lhs: encoded, NU_UNLIMITED, |
321 | c, read, |
322 | com: nu_nocase_compound_read, |
323 | NU_FOLDING_FUNCTION, nu_casemap_read); |
324 | } |
325 | |
326 | const char* nu_strrchr(const char *encoded, uint32_t c, nu_read_iterator_t read) { |
327 | return _nu_strrchr(encoded, NU_UNLIMITED, |
328 | c, read, |
329 | com: nu_default_compound_read, |
330 | casemap: 0, casemap_read: 0); |
331 | } |
332 | |
333 | const char* nu_strrcasechr(const char *encoded, uint32_t c, nu_read_iterator_t read) { |
334 | return _nu_strrchr(encoded, NU_UNLIMITED, c, read, |
335 | com: nu_nocase_compound_read, |
336 | NU_FOLDING_FUNCTION, nu_casemap_read); |
337 | } |
338 | |
339 | int nu_strcoll(const char *s1, const char *s2, |
340 | nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) { |
341 | return _nu_strcoll(lhs: s1, NU_UNLIMITED, rhs: s2, NU_UNLIMITED, |
342 | it1: s1_read, it2: s2_read, |
343 | com1: nu_default_compound_read, com2: nu_default_compound_read, |
344 | weight: nu_ducet_weight, context: 0, |
345 | collated_left: 0, collated_right: 0); |
346 | } |
347 | |
348 | int nu_strcasecoll(const char *s1, const char *s2, |
349 | nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) { |
350 | return _nu_strcoll(lhs: s1, NU_UNLIMITED, rhs: s2, NU_UNLIMITED, |
351 | it1: s1_read, it2: s2_read, |
352 | com1: nu_nocase_compound_read, com2: nu_nocase_compound_read, |
353 | weight: nu_ducet_weight, context: 0, |
354 | collated_left: 0, collated_right: 0); |
355 | } |
356 | |
357 | const char* nu_strstr(const char *haystack, const char *needle, |
358 | nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) { |
359 | return _nu_strstr(haystack, NU_UNLIMITED, needle, NU_UNLIMITED, |
360 | it1: haystack_read, it2: needle_read, |
361 | com1: nu_default_compound_read, com2: nu_default_compound_read, |
362 | casemap: 0, casemap_read: 0, |
363 | weight: nu_ducet_weight, context: 0); |
364 | } |
365 | |
366 | const char* nu_strcasestr(const char *haystack, const char *needle, |
367 | nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) { |
368 | return _nu_strstr(haystack, NU_UNLIMITED, needle, NU_UNLIMITED, |
369 | it1: haystack_read, it2: needle_read, |
370 | com1: nu_nocase_compound_read, com2: nu_nocase_compound_read, |
371 | NU_FOLDING_FUNCTION, nu_casemap_read, |
372 | weight: nu_ducet_weight, context: 0); |
373 | } |
374 | |
375 | #endif /* NU_WITH_Z_COLLATION */ |
376 | |
377 | #ifdef NU_WITH_N_COLLATION |
378 | |
379 | const char* nu_strnchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) { |
380 | return _nu_strchr(encoded, encoded + max_len, |
381 | c, read, |
382 | nu_default_compound_read, |
383 | 0, 0); |
384 | } |
385 | |
386 | const char* nu_strcasenchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) { |
387 | return _nu_strchr(encoded, encoded + max_len, |
388 | c, read, |
389 | nu_nocase_compound_read, |
390 | NU_FOLDING_FUNCTION, nu_casemap_read); |
391 | } |
392 | |
393 | const char* nu_strrnchr(const char *encoded, size_t max_len, uint32_t c, nu_read_iterator_t read) { |
394 | return _nu_strrchr(encoded, encoded + max_len, |
395 | c, read, |
396 | nu_default_compound_read, |
397 | 0, 0); |
398 | } |
399 | |
400 | const char* nu_strrcasenchr(const char *encoded, size_t max_len, uint32_t c, |
401 | nu_read_iterator_t read) { |
402 | return _nu_strrchr(encoded, encoded + max_len, |
403 | c, read, |
404 | nu_nocase_compound_read, |
405 | NU_FOLDING_FUNCTION, nu_casemap_read); |
406 | } |
407 | |
408 | int nu_strncoll(const char *s1, size_t s1_max_len, |
409 | const char *s2, size_t s2_max_len, |
410 | nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) { |
411 | return _nu_strcoll(s1, s1 + s1_max_len, s2, s2 + s2_max_len, |
412 | s1_read, s2_read, |
413 | nu_default_compound_read, nu_default_compound_read, |
414 | nu_ducet_weight, 0, |
415 | 0, 0); |
416 | } |
417 | |
418 | int nu_strcasencoll(const char *s1, size_t s1_max_len, |
419 | const char *s2, size_t s2_max_len, |
420 | nu_read_iterator_t s1_read, nu_read_iterator_t s2_read) { |
421 | return _nu_strcoll(s1, s1 + s1_max_len, s2, s2 + s2_max_len, |
422 | s1_read, s2_read, |
423 | nu_nocase_compound_read, nu_nocase_compound_read, |
424 | nu_ducet_weight, 0, |
425 | 0, 0); |
426 | } |
427 | |
428 | const char* nu_strnstr(const char *haystack, size_t haystack_max_len, |
429 | const char *needle, size_t needle_max_len, |
430 | nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) { |
431 | return _nu_strstr(haystack, haystack + haystack_max_len, |
432 | needle, needle + needle_max_len, |
433 | haystack_read, needle_read, |
434 | nu_default_compound_read, nu_default_compound_read, |
435 | 0, 0, |
436 | nu_ducet_weight, 0); |
437 | } |
438 | |
439 | const char* nu_strcasenstr(const char *haystack, size_t haystack_max_len, |
440 | const char *needle, size_t needle_max_len, |
441 | nu_read_iterator_t haystack_read, nu_read_iterator_t needle_read) { |
442 | return _nu_strstr(haystack, haystack + haystack_max_len, |
443 | needle, needle + needle_max_len, |
444 | haystack_read, needle_read, |
445 | nu_nocase_compound_read, nu_nocase_compound_read, |
446 | NU_FOLDING_FUNCTION, nu_casemap_read, |
447 | nu_ducet_weight, 0); |
448 | } |
449 | |
450 | #endif /* NU_WITH_N_COLLATION */ |
451 | |
452 | #endif /* NU_WITH_Z_COLLATION || NU_WITH_N_COLLATION */ |
453 | |