/* Copyright (C) 2001-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
17 | |
18 | #include <assert.h> |
19 | #include <errno.h> |
20 | #include <error.h> |
21 | #include <fcntl.h> |
22 | #include <getopt.h> |
23 | #include <iconv.h> |
24 | #include <locale.h> |
25 | #include <mcheck.h> |
26 | #include <stdint.h> |
27 | #include <stdio.h> |
28 | #include <stdlib.h> |
29 | #include <string.h> |
30 | #include <time.h> |
31 | #include <unistd.h> |
32 | #include <sys/stat.h> |
33 | #include <sys/types.h> |
34 | #include <regex.h> |
35 | #include <support/support.h> |
36 | |
37 | |
#if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0
/* Clock ID obtained from clock_getcpuclockid in do_test.  */
static clockid_t cl;
/* Nonzero while CL can be used; cleared as soon as a clock call fails.  */
static int use_clock;
#endif
/* Conversion descriptor from UTF-8 to ISO-8859-1, opened in do_test.  */
static iconv_t cd;
/* Content of the input file as read from disk, NUL-terminated.  */
static char *mem;
/* Content of the input file after conversion with CD, NUL-terminated.  */
static char *umem;
/* Number of bytes in MEM, not counting the terminating NUL.  */
static size_t memlen;
/* Number of bytes in UMEM, not counting the terminating NUL.  */
static size_t umemlen;
/* Set to 1 through the "timing" entry in CMDLINE_OPTIONS.  */
static int timing;

static int test_expr (const char *expr, int expected, int expectedicase);
static int run_test (const char *expr, const char *mem, size_t memlen,
                     int icase, int expected);
static int run_test_backwards (const char *expr, const char *mem,
                               size_t memlen, int icase, int expected);
54 | |
55 | |
56 | static int |
57 | do_test (void) |
58 | { |
59 | const char *file; |
60 | int fd; |
61 | struct stat st; |
62 | int result = 0; |
63 | char *inmem; |
64 | char *outmem; |
65 | size_t inlen; |
66 | size_t outlen; |
67 | |
68 | mtrace (); |
69 | |
70 | /* Make the content of the file available in memory. */ |
71 | file = "./tst-regex.input" ; |
72 | fd = open (file: file, O_RDONLY); |
73 | if (fd == -1) |
74 | error (EXIT_FAILURE, errno, format: "cannot open %s" , basename (file)); |
75 | |
76 | if (fstat (fd: fd, buf: &st) != 0) |
77 | error (EXIT_FAILURE, errno, format: "cannot stat %s" , basename (file)); |
78 | memlen = st.st_size; |
79 | |
80 | mem = (char *) malloc (size: memlen + 1); |
81 | if (mem == NULL) |
82 | error (EXIT_FAILURE, errno, format: "while allocating buffer" ); |
83 | |
84 | if ((size_t) read (fd, mem, memlen) != memlen) |
85 | error (EXIT_FAILURE, errnum: 0, format: "cannot read entire file" ); |
86 | mem[memlen] = '\0'; |
87 | |
88 | close (fd: fd); |
89 | |
90 | /* We have to convert a few things from UTF-8 to Latin-1. */ |
91 | cd = iconv_open (tocode: "ISO-8859-1" , fromcode: "UTF-8" ); |
92 | if (cd == (iconv_t) -1) |
93 | error (EXIT_FAILURE, errno, format: "cannot get conversion descriptor" ); |
94 | |
95 | /* For the second test we have to convert the file content to Latin-1. |
96 | This cannot grow the data. */ |
97 | umem = (char *) malloc (size: memlen + 1); |
98 | if (umem == NULL) |
99 | error (EXIT_FAILURE, errno, format: "while allocating buffer" ); |
100 | |
101 | inmem = mem; |
102 | inlen = memlen; |
103 | outmem = umem; |
104 | outlen = memlen; |
105 | iconv (cd: cd, inbuf: &inmem, inbytesleft: &inlen, outbuf: &outmem, outbytesleft: &outlen); |
106 | umemlen = outmem - umem; |
107 | if (inlen != 0) |
108 | error (EXIT_FAILURE, errno, format: "cannot convert buffer" ); |
109 | umem[umemlen] = '\0'; |
110 | |
111 | #if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0 |
112 | # if _POSIX_CPUTIME == 0 |
113 | if (sysconf (_SC_CPUTIME) < 0) |
114 | use_clock = 0; |
115 | else |
116 | # endif |
117 | /* See whether we can use the CPU clock. */ |
118 | use_clock = clock_getcpuclockid (pid: 0, clock_id: &cl) == 0; |
119 | #endif |
120 | |
121 | #ifdef DEBUG |
122 | re_set_syntax (RE_DEBUG); |
123 | #endif |
124 | |
125 | /* Run the actual tests. All tests are run in a single-byte and a |
126 | multi-byte locale. */ |
127 | result |= test_expr (expr: "[äáàâéèêíìîñöóòôüúùû]" , expected: 4, expectedicase: 4); |
128 | result |= test_expr (expr: "G.ran" , expected: 2, expectedicase: 3); |
129 | result |= test_expr (expr: "G.\\{1\\}ran" , expected: 2, expectedicase: 3); |
130 | result |= test_expr (expr: "G.*ran" , expected: 3, expectedicase: 44); |
131 | result |= test_expr (expr: "[äáàâ]" , expected: 0, expectedicase: 0); |
132 | result |= test_expr (expr: "Uddeborg" , expected: 2, expectedicase: 2); |
133 | result |= test_expr (expr: ".Uddeborg" , expected: 2, expectedicase: 2); |
134 | |
135 | /* Free the resources. */ |
136 | free (ptr: umem); |
137 | iconv_close (cd: cd); |
138 | free (ptr: mem); |
139 | |
140 | return result; |
141 | } |
142 | |
143 | |
144 | static int |
145 | test_expr (const char *expr, int expected, int expectedicase) |
146 | { |
147 | int result = 0; |
148 | char *inmem; |
149 | char *outmem; |
150 | size_t inlen; |
151 | size_t outlen; |
152 | char *uexpr; |
153 | |
154 | /* First test: search with basic C.UTF-8 locale. */ |
155 | printf (format: "INFO: Testing C.UTF-8.\n" ); |
156 | xsetlocale (LC_ALL, locale: "C.UTF-8" ); |
157 | |
158 | printf (format: "\nTest \"%s\" with multi-byte locale\n" , expr); |
159 | result |= run_test (expr, mem, memlen, icase: 0, expected); |
160 | printf (format: "\nTest \"%s\" with multi-byte locale, case insensitive\n" , expr); |
161 | result |= run_test (expr, mem, memlen, icase: 1, expected: expectedicase); |
162 | printf (format: "\nTest \"%s\" backwards with multi-byte locale\n" , expr); |
163 | result |= run_test_backwards (expr, mem, memlen, icase: 0, expected); |
164 | printf (format: "\nTest \"%s\" backwards with multi-byte locale, case insensitive\n" , |
165 | expr); |
166 | result |= run_test_backwards (expr, mem, memlen, icase: 1, expected: expectedicase); |
167 | |
168 | /* Second test: search with an UTF-8 locale. */ |
169 | printf (format: "INFO: Testing de_DE.UTF-8.\n" ); |
170 | xsetlocale (LC_ALL, locale: "de_DE.UTF-8" ); |
171 | |
172 | printf (format: "\nTest \"%s\" with multi-byte locale\n" , expr); |
173 | result |= run_test (expr, mem, memlen, icase: 0, expected); |
174 | printf (format: "\nTest \"%s\" with multi-byte locale, case insensitive\n" , expr); |
175 | result |= run_test (expr, mem, memlen, icase: 1, expected: expectedicase); |
176 | printf (format: "\nTest \"%s\" backwards with multi-byte locale\n" , expr); |
177 | result |= run_test_backwards (expr, mem, memlen, icase: 0, expected); |
178 | printf (format: "\nTest \"%s\" backwards with multi-byte locale, case insensitive\n" , |
179 | expr); |
180 | result |= run_test_backwards (expr, mem, memlen, icase: 1, expected: expectedicase); |
181 | |
182 | /* Second test: search with an ISO-8859-1 locale. */ |
183 | printf (format: "INFO: Testing de_DE.ISO-8859-1.\n" ); |
184 | xsetlocale (LC_ALL, locale: "de_DE.ISO-8859-1" ); |
185 | |
186 | inmem = (char *) expr; |
187 | inlen = strlen (expr); |
188 | outlen = inlen; |
189 | outmem = uexpr = alloca (outlen + 1); |
190 | memset (outmem, '\0', outlen + 1); |
191 | iconv (cd: cd, inbuf: &inmem, inbytesleft: &inlen, outbuf: &outmem, outbytesleft: &outlen); |
192 | if (inlen != 0) |
193 | error (EXIT_FAILURE, errno, format: "cannot convert expression" ); |
194 | |
195 | /* Run the tests. */ |
196 | printf (format: "\nTest \"%s\" with 8-bit locale\n" , expr); |
197 | result |= run_test (expr: uexpr, mem: umem, memlen: umemlen, icase: 0, expected); |
198 | printf (format: "\nTest \"%s\" with 8-bit locale, case insensitive\n" , expr); |
199 | result |= run_test (expr: uexpr, mem: umem, memlen: umemlen, icase: 1, expected: expectedicase); |
200 | printf (format: "\nTest \"%s\" backwards with 8-bit locale\n" , expr); |
201 | result |= run_test_backwards (expr: uexpr, mem: umem, memlen: umemlen, icase: 0, expected); |
202 | printf (format: "\nTest \"%s\" backwards with 8-bit locale, case insensitive\n" , |
203 | expr); |
204 | result |= run_test_backwards (expr: uexpr, mem: umem, memlen: umemlen, icase: 1, expected: expectedicase); |
205 | |
206 | return result; |
207 | } |
208 | |
209 | |
210 | static int |
211 | run_test (const char *expr, const char *mem, size_t memlen, int icase, |
212 | int expected) |
213 | { |
214 | #if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0 |
215 | struct timespec start; |
216 | struct timespec finish; |
217 | #endif |
218 | regex_t re; |
219 | int err; |
220 | size_t offset; |
221 | int cnt; |
222 | |
223 | #if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0 |
224 | if (use_clock && !timing) |
225 | use_clock = clock_gettime (clock_id: cl, tp: &start) == 0; |
226 | #endif |
227 | |
228 | err = regcomp (preg: &re, pattern: expr, REG_NEWLINE | (icase ? REG_ICASE : 0)); |
229 | if (err != REG_NOERROR) |
230 | { |
231 | char buf[200]; |
232 | regerror (errcode: err, preg: &re, errbuf: buf, errbuf_size: sizeof buf); |
233 | error (EXIT_FAILURE, errnum: 0, format: "cannot compile expression: %s" , buf); |
234 | } |
235 | |
236 | cnt = 0; |
237 | offset = 0; |
238 | assert (mem[memlen] == '\0'); |
239 | while (offset < memlen) |
240 | { |
241 | regmatch_t ma[1]; |
242 | const char *sp; |
243 | const char *ep; |
244 | |
245 | err = regexec (preg: &re, String: mem + offset, nmatch: 1, pmatch: ma, eflags: 0); |
246 | if (err == REG_NOMATCH) |
247 | break; |
248 | |
249 | if (err != REG_NOERROR) |
250 | { |
251 | char buf[200]; |
252 | regerror (errcode: err, preg: &re, errbuf: buf, errbuf_size: sizeof buf); |
253 | error (EXIT_FAILURE, errnum: 0, format: "cannot use expression: %s" , buf); |
254 | } |
255 | |
256 | assert (ma[0].rm_so >= 0); |
257 | sp = mem + offset + ma[0].rm_so; |
258 | while (sp > mem && sp[-1] != '\n') |
259 | --sp; |
260 | |
261 | ep = mem + offset + ma[0].rm_so; |
262 | while (*ep != '\0' && *ep != '\n') |
263 | ++ep; |
264 | |
265 | printf (format: "match %d: \"%.*s\"\n" , ++cnt, (int) (ep - sp), sp); |
266 | |
267 | offset = ep + 1 - mem; |
268 | } |
269 | |
270 | regfree (preg: &re); |
271 | |
272 | #if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0 |
273 | if (use_clock && !timing) |
274 | { |
275 | use_clock = clock_gettime (clock_id: cl, tp: &finish) == 0; |
276 | if (use_clock) |
277 | { |
278 | if (finish.tv_nsec < start.tv_nsec) |
279 | { |
280 | finish.tv_nsec -= start.tv_nsec - 1000000000; |
281 | finish.tv_sec -= 1 + start.tv_sec; |
282 | } |
283 | else |
284 | { |
285 | finish.tv_nsec -= start.tv_nsec; |
286 | finish.tv_sec -= start.tv_sec; |
287 | } |
288 | |
289 | printf (format: "elapsed time: %jd.%09jd sec\n" , |
290 | (intmax_t) finish.tv_sec, (intmax_t) finish.tv_nsec); |
291 | } |
292 | } |
293 | |
294 | if (use_clock && timing) |
295 | { |
296 | struct timespec mintime = { .tv_sec = 24 * 60 * 60 }; |
297 | |
298 | for (int i = 0; i < 10; ++i) |
299 | { |
300 | offset = 0; |
301 | use_clock = clock_gettime (clock_id: cl, tp: &start) == 0; |
302 | |
303 | if (!use_clock) |
304 | continue; |
305 | |
306 | err = regcomp (preg: &re, pattern: expr, REG_NEWLINE | (icase ? REG_ICASE : 0)); |
307 | if (err != REG_NOERROR) |
308 | continue; |
309 | |
310 | while (offset < memlen) |
311 | { |
312 | regmatch_t ma[1]; |
313 | |
314 | err = regexec (preg: &re, String: mem + offset, nmatch: 1, pmatch: ma, eflags: 0); |
315 | if (err != REG_NOERROR) |
316 | break; |
317 | |
318 | offset += ma[0].rm_eo; |
319 | } |
320 | |
321 | regfree (preg: &re); |
322 | |
323 | use_clock = clock_gettime (clock_id: cl, tp: &finish) == 0; |
324 | if (use_clock) |
325 | { |
326 | if (finish.tv_nsec < start.tv_nsec) |
327 | { |
328 | finish.tv_nsec -= start.tv_nsec - 1000000000; |
329 | finish.tv_sec -= 1 + start.tv_sec; |
330 | } |
331 | else |
332 | { |
333 | finish.tv_nsec -= start.tv_nsec; |
334 | finish.tv_sec -= start.tv_sec; |
335 | } |
336 | if (finish.tv_sec < mintime.tv_sec |
337 | || (finish.tv_sec == mintime.tv_sec |
338 | && finish.tv_nsec < mintime.tv_nsec)) |
339 | mintime = finish; |
340 | } |
341 | } |
342 | printf (format: "elapsed time: %jd.%09jd sec\n" , |
343 | (intmax_t) mintime.tv_sec, (intmax_t) mintime.tv_nsec); |
344 | } |
345 | #endif |
346 | |
347 | /* Return an error if the number of matches found is not match we |
348 | expect. */ |
349 | return cnt != expected; |
350 | } |
351 | |
352 | |
353 | static int |
354 | run_test_backwards (const char *expr, const char *mem, size_t memlen, |
355 | int icase, int expected) |
356 | { |
357 | #if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0 |
358 | struct timespec start; |
359 | struct timespec finish; |
360 | #endif |
361 | struct re_pattern_buffer re; |
362 | const char *err; |
363 | size_t offset; |
364 | int cnt; |
365 | |
366 | #if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0 |
367 | if (use_clock && !timing) |
368 | use_clock = clock_gettime (clock_id: cl, tp: &start) == 0; |
369 | #endif |
370 | |
371 | re_set_syntax (syntax: (RE_SYNTAX_POSIX_BASIC & ~RE_DOT_NEWLINE) |
372 | | RE_HAT_LISTS_NOT_NEWLINE |
373 | | (icase ? RE_ICASE : 0)); |
374 | |
375 | memset (&re, 0, sizeof (re)); |
376 | re.fastmap = malloc (size: 256); |
377 | if (re.fastmap == NULL) |
378 | error (EXIT_FAILURE, errno, format: "cannot allocate fastmap" ); |
379 | |
380 | err = re_compile_pattern (pattern: expr, length: strlen (expr), buffer: &re); |
381 | if (err != NULL) |
382 | error (EXIT_FAILURE, errnum: 0, format: "cannot compile expression: %s" , err); |
383 | |
384 | if (re_compile_fastmap (buffer: &re)) |
385 | error (EXIT_FAILURE, errnum: 0, format: "couldn't compile fastmap" ); |
386 | |
387 | cnt = 0; |
388 | offset = memlen; |
389 | assert (mem[memlen] == '\0'); |
390 | while (offset <= memlen) |
391 | { |
392 | int start; |
393 | const char *sp; |
394 | const char *ep; |
395 | |
396 | start = re_search (buffer: &re, String: mem, length: memlen, start: offset, range: -offset, NULL); |
397 | if (start == -1) |
398 | break; |
399 | |
400 | if (start == -2) |
401 | error (EXIT_FAILURE, errnum: 0, format: "internal error in re_search" ); |
402 | |
403 | sp = mem + start; |
404 | while (sp > mem && sp[-1] != '\n') |
405 | --sp; |
406 | |
407 | ep = mem + start; |
408 | while (*ep != '\0' && *ep != '\n') |
409 | ++ep; |
410 | |
411 | printf (format: "match %d: \"%.*s\"\n" , ++cnt, (int) (ep - sp), sp); |
412 | |
413 | offset = sp - 1 - mem; |
414 | } |
415 | |
416 | regfree (preg: &re); |
417 | |
418 | #if defined _POSIX_CPUTIME && _POSIX_CPUTIME >= 0 |
419 | if (use_clock && !timing) |
420 | { |
421 | use_clock = clock_gettime (clock_id: cl, tp: &finish) == 0; |
422 | if (use_clock) |
423 | { |
424 | if (finish.tv_nsec < start.tv_nsec) |
425 | { |
426 | finish.tv_nsec -= start.tv_nsec - 1000000000; |
427 | finish.tv_sec -= 1 + start.tv_sec; |
428 | } |
429 | else |
430 | { |
431 | finish.tv_nsec -= start.tv_nsec; |
432 | finish.tv_sec -= start.tv_sec; |
433 | } |
434 | |
435 | printf (format: "elapsed time: %jd.%09jd sec\n" , |
436 | (intmax_t) finish.tv_sec, (intmax_t) finish.tv_nsec); |
437 | } |
438 | } |
439 | |
440 | if (use_clock && timing) |
441 | { |
442 | struct timespec mintime = { .tv_sec = 24 * 60 * 60 }; |
443 | |
444 | for (int i = 0; i < 10; ++i) |
445 | { |
446 | offset = memlen; |
447 | use_clock = clock_gettime (clock_id: cl, tp: &start) == 0; |
448 | |
449 | if (!use_clock) |
450 | continue; |
451 | |
452 | memset (&re, 0, sizeof (re)); |
453 | re.fastmap = malloc (size: 256); |
454 | if (re.fastmap == NULL) |
455 | continue; |
456 | |
457 | err = re_compile_pattern (pattern: expr, length: strlen (expr), buffer: &re); |
458 | if (err != NULL) |
459 | continue; |
460 | |
461 | if (re_compile_fastmap (buffer: &re)) |
462 | { |
463 | regfree (preg: &re); |
464 | continue; |
465 | } |
466 | |
467 | while (offset <= memlen) |
468 | { |
469 | int start; |
470 | const char *sp; |
471 | |
472 | start = re_search (buffer: &re, String: mem, length: memlen, start: offset, range: -offset, NULL); |
473 | if (start < -1) |
474 | break; |
475 | |
476 | sp = mem + start; |
477 | while (sp > mem && sp[-1] != '\n') |
478 | --sp; |
479 | |
480 | offset = sp - 1 - mem; |
481 | } |
482 | |
483 | regfree (preg: &re); |
484 | |
485 | use_clock = clock_gettime (clock_id: cl, tp: &finish) == 0; |
486 | if (use_clock) |
487 | { |
488 | if (finish.tv_nsec < start.tv_nsec) |
489 | { |
490 | finish.tv_nsec -= start.tv_nsec - 1000000000; |
491 | finish.tv_sec -= 1 + start.tv_sec; |
492 | } |
493 | else |
494 | { |
495 | finish.tv_nsec -= start.tv_nsec; |
496 | finish.tv_sec -= start.tv_sec; |
497 | } |
498 | if (finish.tv_sec < mintime.tv_sec |
499 | || (finish.tv_sec == mintime.tv_sec |
500 | && finish.tv_nsec < mintime.tv_nsec)) |
501 | mintime = finish; |
502 | } |
503 | } |
504 | printf (format: "elapsed time: %jd.%09jd sec\n" , |
505 | (intmax_t) mintime.tv_sec, (intmax_t) mintime.tv_nsec); |
506 | } |
507 | #endif |
508 | |
509 | /* Return an error if the number of matches found is not match we |
510 | expect. */ |
511 | return cnt != expected; |
512 | } |
513 | |
514 | /* If --timing is used we will need a larger timout. */ |
515 | #define TIMEOUT 50 |
516 | #define CMDLINE_OPTIONS \ |
517 | {"timing", no_argument, &timing, 1 }, |
518 | #define TEST_FUNCTION do_test () |
519 | #include "../test-skeleton.c" |
520 | |