1 | /* Regular expression tests. |
2 | Copyright (C) 2003-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sys/types.h> |
20 | #include <mcheck.h> |
21 | #include <regex.h> |
22 | #include <stdio.h> |
23 | #include <stdlib.h> |
24 | #include <string.h> |
25 | #include <locale.h> |
26 | #include <getopt.h> |
27 | |
/* Expand the test-file placeholders in STR in place: 'N' becomes a
   newline, 'T' a tab, 'S' a space and 'Z' a NUL byte.  After each
   replacement the scan resumes at the byte following the replaced
   character.  */
static void
replace_special_chars (char *str)
{
  char *hit = strpbrk (str, "NTSZ");

  while (hit != NULL)
    {
      if (*hit == 'N')
        *hit = '\n';
      else if (*hit == 'T')
        *hit = '\t';
      else if (*hit == 'S')
        *hit = ' ';
      else if (*hit == 'Z')
        *hit = '\0';

      hit = strpbrk (hit + 1, "NTSZ");
    }
}
40 | |
/* Rewrite STR in place so that every "[[:<:]]" becomes "\<" and every
   "[[:>:]]" becomes "\>".  Any other text starting with "[[:" is left
   untouched.  The string only ever shrinks.  */
static void
glibc_re_syntax (char *str)
{
  /* One past the terminating NUL, so that the memmove below carries the
     terminator along with the tail of the string.  */
  char *limit = str + strlen (str) + 1;
  char *cur = str;

  while ((cur = strstr (cur, "[[:")) != NULL)
    {
      const int is_word_edge = cur[3] == '<' || cur[3] == '>';

      if (!is_word_edge || strncmp (cur + 4, ":]]", 3) != 0)
        {
          /* Some other bracket construct: step over the "[[:".  */
          cur += 3;
          continue;
        }

      /* Seven bytes collapse into two; close the five-byte gap.  */
      cur[0] = '\\';
      cur[1] = cur[3];
      memmove (cur + 2, cur + 7, limit - cur - 7);
      limit -= 5;
      cur += 2;
    }
}
59 | |
/* If C is one of the letters a-d or A-D, store its two-byte UTF-8
   substitute at DST and return DST + 2; otherwise store nothing and
   return DST unchanged.  The substitutes are:
     a/A -> a-acute / A-acute
     b/B -> c-caron / C-caron
     c/C -> d-caron / D-caron
     d/D -> e-acute / E-acute  */
static char *
mb_replace (char *dst, const char c)
{
  static const struct
  {
    char ascii;
    char utf8[2];
  } map[] =
    {
      { 'a', { '\xc3', '\xa1' } },
      { 'A', { '\xc3', '\x81' } },
      { 'b', { '\xc4', '\x8d' } },
      { 'B', { '\xc4', '\x8c' } },
      { 'c', { '\xc4', '\x8f' } },
      { 'C', { '\xc4', '\x8e' } },
      { 'd', { '\xc3', '\xa9' } },
      { 'D', { '\xc3', '\x89' } },
    };
  size_t i;

  for (i = 0; i < sizeof (map) / sizeof (map[0]); ++i)
    if (map[i].ascii == c)
      {
        dst[0] = map[i].utf8[0];
        dst[1] = map[i].utf8[1];
        return dst + 2;
      }

  return dst;
}
104 | |
/* Return a newly malloc'ed copy of STR in which every character that
   occurs in LETTERS has been passed through mb_replace; all other
   characters are copied verbatim.  Returns NULL if STR is NULL or if
   the allocation fails.  The caller frees the result.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  char *result, *out;
  const char *in;

  if (str == NULL)
    return NULL;

  /* mb_replace writes at most two bytes per input byte.  */
  result = malloc (2 * strlen (str) + 1);
  if (result == NULL)
    return NULL;

  out = result;
  in = str;
  while (*in != '\0')
    {
      const char ch = *in++;

      if (strchr (letters, ch) != NULL)
        out = mb_replace (out, ch);
      else
        *out++ = ch;
    }
  *out = '\0';

  return result;
}
126 | |
/* Pattern-aware variant of mb_frob_string.  Characters from LETTERS
   are passed through mb_replace except when they follow a backslash
   or lie between [: and :], [. and .] or [= and =].  Returns a newly
   malloc'ed string, or NULL if STR is NULL or the allocation
   fails.  */

static char *
mb_frob_pattern (const char *str, const char *letters)
{
  char *result, *out;
  const char *in;
  int inside_class = 0;
  int after_backslash = 0;

  if (str == NULL)
    return NULL;

  /* mb_replace writes at most two bytes per input byte.  */
  result = malloc (2 * strlen (str) + 1);
  if (result == NULL)
    return NULL;

  out = result;
  for (in = str; *in != '\0'; ++in)
    {
      const char ch = *in;

      if (ch == '\\')
        {
          /* A backslash toggles the escape state, so "\\\\" leaves the
             following character unescaped.  */
          after_backslash = !after_backslash;
          *out++ = ch;
          continue;
        }

      if (after_backslash)
        {
          /* Escaped character: copy verbatim and leave escape mode.  */
          after_backslash = 0;
          *out++ = ch;
          continue;
        }

      if (!inside_class && strchr (letters, ch) != NULL)
        {
          out = mb_replace (out, ch);
          continue;
        }

      if (inside_class)
        {
          /* A ']' preceded by ':', '.' or '=' closes the construct.  */
          if (ch == ']' && strchr (":.=", in[-1]) != NULL)
            inside_class = 0;
        }
      else if (ch == '[' && strchr (":.=", in[1]) != NULL)
        /* A '[' followed by ':', '.' or '=' opens the construct.  */
        inside_class = 1;

      *out++ = ch;
    }
  *out = '\0';

  return result;
}
170 | |
/* Compare submatch RM[IDX] of STRING against the expectation MATCH.
   MATCH is one of:
     "-"      the submatch must be unset (both offsets -1);
     "@text"  the submatch must be empty and STRING at that offset must
              start with TEXT (a bare "@" compares one byte against the
              empty string);
     other    the submatch must be exactly MATCH.
   Returns 0 when the expectation holds.  Otherwise prints a diagnostic
   prefixed with FAIL and returns 1.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  const regmatch_t *m = &rm[idx];
  const char *found;

  if (strcmp (match, "-") == 0)
    {
      if (m->rm_so == -1 && m->rm_eo == -1)
        return 0;
      printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
      return 1;
    }

  if (m->rm_so == -1 || m->rm_eo == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  found = string + m->rm_so;

  if (match[0] == '@')
    {
      const char *anchor = match + 1;
      size_t cmp_len;

      if (m->rm_so != m->rm_eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      /* Always compare at least one byte.  */
      cmp_len = strlen (anchor);
      if (cmp_len == 0)
        cmp_len = 1;

      if (strncmp (found, anchor, cmp_len) != 0)
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  {
    const regoff_t span = m->rm_eo - m->rm_so;

    if ((size_t) span == strlen (match)
        && strncmp (found, match, span) == 0)
      return 0;
  }

  printf ("%s rm[%d] not matching %s\n", fail, idx, match);
  return 1;
}
215 | |
/* Run one test case: compile PATTERN with CFLAGS and, unless EFLAGS is
   -1, execute it against STRING with EFLAGS.

   When EFLAGS is -1 the case expects regcomp to fail, and STRING holds
   the name of the expected error code without its "REG_" prefix.

   Otherwise EXPECT describes the whole match (NULL means no match is
   expected) and MATCHES is an optional comma-separated list of
   expectations for submatches 1..9, each in the form understood by
   check_match.  Submatches beyond the end of the list must be unset.
   MATCHES is modified temporarily while being split and is restored
   afterwards.

   Returns 0 on success.  On failure prints a diagnostic prefixed with
   FAIL and returns 1.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  static const struct
  {
    reg_errcode_t code;
    const char *name;
  } codes[] =
    {
      { REG_NOERROR, "NOERROR" }, { REG_NOMATCH, "NOMATCH" },
      { REG_BADPAT, "BADPAT" }, { REG_ECOLLATE, "ECOLLATE" },
      { REG_ECTYPE, "ECTYPE" }, { REG_EESCAPE, "EESCAPE" },
      { REG_ESUBREG, "ESUBREG" }, { REG_EBRACK, "EBRACK" },
      { REG_EPAREN, "EPAREN" }, { REG_EBRACE, "EBRACE" },
      { REG_BADBR, "BADBR" }, { REG_ERANGE, "ERANGE" },
      { REG_ESPACE, "ESPACE" }, { REG_BADRPT, "BADRPT" },
    };
  const int expect_compile_error = eflags == -1;
  regex_t re;
  regmatch_t rm[10];
  int rc, exec_failed, status, idx;

  rc = regcomp (&re, pattern, cflags);
  if (rc != 0)
    {
      size_t i;

      if (!expect_compile_error)
        {
          char buf[500];

          regerror (rc, &re, buf, sizeof (buf));
          printf ("%s regcomp failed: %s\n", fail, buf);
          return 1;
        }

      /* A failure was expected; make sure it is the right one.  */
      for (i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
        {
          if (rc != codes[i].code)
            continue;
          if (strcmp (string, codes[i].name) == 0)
            return 0;
          printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                  fail, codes[i].name, string);
          return 1;
        }

      printf ("%s regcomp return value REG_%d\n", fail, rc);
      return 1;
    }

  if (expect_compile_error)
    {
      regfree (&re);

      /* The test case file assumes something only guaranteed by the
         rxspencer regex implementation: that regcomp() returns
         REG_EMPTY for empty expressions.  That is not the case here,
         so this particular expectation is ignored.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  /* The compiled pattern is not needed once regexec has filled RM.  */
  exec_failed = regexec (&re, string, 10, rm, eflags) != 0;
  regfree (&re);

  if (exec_failed)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* With REG_NOSUB there are no offsets to verify.  */
  if (cflags & REG_NOSUB)
    return 0;

  status = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return status;

  for (idx = 1; status == 0 && idx < 10; ++idx)
    {
      char *comma;

      if (matches == NULL)
        {
          /* The list is exhausted: the submatch must be unset.  */
          status = check_match (rm, idx, string, "-", fail);
          continue;
        }

      /* Temporarily terminate the current list entry at its comma.  */
      comma = strchr (matches, ',');
      if (comma != NULL)
        *comma = '\0';

      status = check_match (rm, idx, string, matches, fail);

      if (comma != NULL)
        {
          *comma = ',';
          matches = comma + 1;
        }
      else
        matches = NULL;
    }

  return status;
}
319 | |
/* Run one test case with the characters in LETTERS replaced by their
   multibyte substitutes.  PATTERN is converted with mb_frob_pattern;
   EXPECT and MATCHES with mb_frob_string.  STRING is converted with
   mb_frob_string too, unless EFLAGS is -1, in which case it is passed
   through unchanged.  The converted arguments are handed to test()
   and freed afterwards.

   Returns the result of test(), or 1 after printing a diagnostic
   prefixed with FAIL if one of the conversions could not be
   produced.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  const char *string_mb
    = eflags == -1 ? string : mb_frob_string (string, letters);
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret;

  /* EXPECT and MATCHES may legitimately be NULL; a NULL conversion is
     only an error when the corresponding input was non-NULL.  */
  if (!pattern_mb || !string_mb
      || (expect && !expect_mb) || (matches && !matches_mb))
    {
      /* Terminate the line so that later output does not get appended
         to this diagnostic.  */
      printf ("%s %m\n", fail);
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  /* STRING_MB aliases the caller's STRING when EFLAGS is -1.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}
349 | |
/* Run mb_test for every non-empty combination of the letter pairs
   a/A, b/B, c/C and d/D.  A combination is skipped if it includes a
   pair of which neither letter occurs in PATTERN or STRING.  Patterns
   containing "[:xdigit:]" or "[[=b=]]" are not tested at all.
   Returns non-zero if any mb_test call failed, 0 otherwise.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2] =
    {
      { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' }
    };
  int failed = 0;
  unsigned int mask;

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while their multibyte substitutes are not.  */
  if (strstr (pattern, "[:xdigit:]") != NULL)
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]") != NULL)
    return 0;

  /* Each bit of MASK selects one letter pair.  */
  for (mask = 1; mask < 16; ++mask)
    {
      char letters[9], fail[20];
      size_t used = 0;
      int usable = 1;
      unsigned int bit;

      for (bit = 0; usable && bit < 4; ++bit)
        {
          const char lower = pairs[bit][0];
          const char upper = pairs[bit][1];

          if ((mask & (1u << bit)) == 0)
            continue;

          if (strchr (pattern, lower) || strchr (string, lower)
              || strchr (pattern, upper) || strchr (string, upper))
            {
              letters[used++] = lower;
              letters[used++] = upper;
            }
          else
            /* Neither letter occurs anywhere: skip this combination.  */
            usable = 0;
        }

      if (!usable)
        continue;

      letters[used] = '\0';
      sprintf (fail, "UTF-8 %s FAIL", letters);
      failed |= mb_test (pattern, cflags, string, eflags, expect, matches,
                         letters, fail);
    }

  return failed;
}
405 | |
/* Test driver.  Usage: PROGRAM [--utf8] TESTFILE.

   Each line of TESTFILE that is neither empty nor a '#' comment is
   echoed and then split on tabs into: pattern, flags, string and,
   optionally, the expected match and a list of expected submatches.
   A literal "" in the pattern or string field denotes the empty
   string.  Lines with fewer than three fields, or with one of the
   unsupported flags 'm', 'p' or '#', are skipped.

   Every case is run in the C locale.  With --utf8, cases that pass
   are run again in cs_CZ.UTF-8, first unchanged and then through
   mb_tests.  Returns 0 if everything passed, 1 otherwise.  */
int
main (int argc, char **argv)
{
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8", no_argument, &test_utf8, 1},
      {NULL, 0, NULL, 0 }
    };
  int status = 0;
  char *line = NULL;
  size_t line_cap = 0;
  ssize_t nread;
  FILE *input;

  mtrace ();

  /* Consume all options; --utf8 sets test_utf8 through its flag
     pointer.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (optind + 1 != argc)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  input = fopen (argv[optind], "r");
  if (input == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  for (;;)
    {
      char *pattern, *flagstr, *string, *expect, *matches;
      const char *flag;
      int cflags = REG_EXTENDED;
      int eflags = 0;
      int try_bre_ere = 0;
      int unsupported = 0;
      int bre_cflags;

      nread = getline (&line, &line_cap, input);
      if (nread <= 0)
        break;

      if (line[nread - 1] == '\n')
        line[nread - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '#' || *line == '\0')
        continue;

      puts (line);
      fflush (stdout);

      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      for (flag = flagstr; *flag != '\0'; ++flag)
        switch (*flag)
          {
          case '-':
            break;
          case 'b':
            cflags &= ~REG_EXTENDED;
            break;
          case '&':
            try_bre_ere = 1;
            break;
          case 'C':
            /* An eflags value of -1 marks a case that expects regcomp
               to fail.  */
            eflags = -1;
            break;
          case 'i':
            cflags |= REG_ICASE;
            break;
          case 's':
            cflags |= REG_NOSUB;
            break;
          case 'n':
            cflags |= REG_NEWLINE;
            break;
          case '^':
            eflags |= REG_NOTBOL;
            break;
          case '$':
            eflags |= REG_NOTEOL;
            break;
          case 'm':
          case 'p':
          case '#':
            /* Not supported.  */
            unsupported = 1;
            break;
          }

      if (unsupported)
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* For expected compile errors STRING holds an error name and is
         left untouched.  */
      if (eflags != -1)
        replace_special_chars (string);

      matches = NULL;
      expect = strtok (NULL, "\t");
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      bre_cflags = cflags & ~REG_EXTENDED;

      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          status = 1;
        }

      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "FAIL")))
        {
          status = 1;
          continue;
        }

      if (!test_utf8)
        continue;

      if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
        {
          puts ("setlocale cs_CZ.UTF-8 failed");
          status = 1;
          continue;
        }

      if (test (pattern, cflags, string, eflags, expect, matches,
                "UTF-8 FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "UTF-8 FAIL")))
        {
          status = 1;
          continue;
        }

      if (mb_tests (pattern, cflags, string, eflags, expect, matches)
          || (try_bre_ere
              && mb_tests (pattern, bre_cflags, string, eflags, expect,
                           matches)))
        status = 1;
    }

  free (line);
  fclose (input);
  return status;
}
561 | |