/* Test for UTF-8 regular expression optimizations.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18 | |
#include <sys/types.h>
#include <mcheck.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <locale.h>

#define RE_NO_INTERNAL_PROTOTYPES 1
#include "regex_internal.h"
29 | |
/* Shorthands for the two regex syntaxes used in the table below.  */
#define BRE RE_SYNTAX_POSIX_BASIC
#define ERE RE_SYNTAX_POSIX_EXTENDED

/* Test cases.  The table is only ever read, hence const.  */
static const struct
{
  /* Syntax bits handed to re_set_syntax (BRE or ERE).  */
  int syntax;
  /* Pattern handed to re_compile_pattern.  */
  const char *pattern;
  /* Subject string handed to re_search.  */
  const char *string;
  /* RES is the value re_search is expected to return: the byte offset
     of the match, or -1 when the string must not match.  OPTIMIZE is
     nonzero when the compiled pattern is expected to use the
     single-byte path (mb_cur_max == 1) when RE_ICASE is not set.  */
  int res, optimize;
} tests[] = {
  /* \xc3\x84		LATIN CAPITAL LETTER A WITH DIAERESIS
     \xc3\x86		LATIN CAPITAL LETTER AE
     \xc3\x94		LATIN CAPITAL LETTER O WITH CIRCUMFLEX
     \xc3\x96		LATIN CAPITAL LETTER O WITH DIAERESIS
     \xc3\xa4		LATIN SMALL LETTER A WITH DIAERESIS
     \xc3\xb6		LATIN SMALL LETTER O WITH DIAERESIS
     \xe2\x80\x94	EM DASH
     \xe2\x80\x96	DOUBLE VERTICAL LINE  */
  /* Should be optimized.  */
  {BRE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
  {BRE, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1},
  {BRE, "b\xc3\xa4\\+z", "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1},
  {BRE, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  {BRE, "b\xc3\xa4\\{1,2\\}z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
  {BRE, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1},
  {BRE, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
  {BRE, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1},
  {BRE, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1},
  {BRE, "^x\\|x\xc3\xa4*z$", "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1},
  {BRE, "^x\\\\\xc3\x84\\{6\\}z\\+",
   "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
  {BRE, "^x\\\\\xc3\x84\\{2,36\\}z\\+", "x\\\xc3\x84zz\xc3\xb6", -1, 1},
  {BRE, "^x\\\\\xc3\x84\\{,3\\}z\\+",
   "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
  {BRE, "x[C]y", "axCy", 1, 1},
  {BRE, "x[ABC]y", "axCy", 1, 1},
  {BRE, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1},
  {BRE, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
  {BRE, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
  {BRE, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1},
  {BRE, "\\(x\xc3\x84\\)z\\1\x61\\1",
   "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1},
  {BRE, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
  {BRE, "x.y", "ax\xe2\x80\x94yz", 1, 1},
  {BRE, "x.*z", "\xc3\x84xz", 2, 1},
  {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1},
  {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1},
  {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1},
  {BRE, "x.\\?z", "axz", 1, 1},
  {BRE, "x.\\?z", "axyz", 1, 1},
  {BRE, "x.\\?z", "ax\xc3\x84z", 1, 1},
  {BRE, "x.\\?z", "ax\xe2\x80\x94z", 1, 1},
  {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80z", 1, 1},
  {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84z", 1, 1},
  {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1},
  {BRE, ".", "y", 0, 1},
  {BRE, ".", "\xc3\x84", 0, 1},
  {BRE, ".", "\xe2\x80\x94", 0, 1},
  {BRE, ".", "\xf0\x9d\x80\x80", 0, 1},
  {BRE, ".", "\xf9\x81\x82\x83\x84", 0, 1},
  {BRE, ".", "\xfd\xbf\xbf\xbf\xbf\xbf", 0, 1},
  {BRE, "x.\\?z", "axyyz", -1, 1},
  {BRE, "x.\\?z", "ax\xc3\x84\xc3\x96z", -1, 1},
  {BRE, "x.\\?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1},
  {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80yz", -1, 1},
  {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1},
  {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1},
  {BRE, "x.\\+z", "\xe2\x80\x94xz", -1, 1},
  {BRE, "x.\\+z", "\xe2\x80\x94xyz", 3, 1},
  {BRE, "x.\\+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1},
  {BRE, "x.\\+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  {BRE, "x.\\+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  {BRE, "x.\\+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1},
  {BRE, "x.\\+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xz", -1, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xyz", 3, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1},
  {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "axz", 1, 1},
  {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1},
  {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xc3\x86z", 1, 1},
  {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xe2\x80\x96wz", 1, 1},
  {ERE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
  {ERE, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1},
  {ERE, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
  {ERE, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, 1},
  {ERE, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, 1},
  {ERE, "x[C]y", "axCy", 1, 1},
  {ERE, "x[ABC]y", "axCy", 1, 1},
  {ERE, "\\`x|z\\'", "x\xe2\x80\x94", 0, 1},
  {ERE, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
  {ERE, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1},
  {ERE, "x.y", "ax\xe2\x80\x94yz", 1, 1},
  {ERE, "x.*z", "\xc3\x84xz", 2, 1},
  {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1},
  {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1},
  {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1},
  {ERE, "x.?z", "axz", 1, 1},
  {ERE, "x.?z", "axyz", 1, 1},
  {ERE, "x.?z", "ax\xc3\x84z", 1, 1},
  {ERE, "x.?z", "ax\xe2\x80\x94z", 1, 1},
  {ERE, "x.?z", "ax\xf0\x9d\x80\x80z", 1, 1},
  {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84z", 1, 1},
  {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1},
  {ERE, "x.?z", "axyyz", -1, 1},
  {ERE, "x.?z", "ax\xc3\x84\xc3\x96z", -1, 1},
  {ERE, "x.?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1},
  {ERE, "x.?z", "ax\xf0\x9d\x80\x80yz", -1, 1},
  {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1},
  {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1},
  {ERE, "x.+z", "\xe2\x80\x94xz", -1, 1},
  {ERE, "x.+z", "\xe2\x80\x94xyz", 3, 1},
  {ERE, "x.+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1},
  {ERE, "x.+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  {ERE, "x.+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  {ERE, "x.+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1},
  {ERE, "x.+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94xz", -1, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94xyz", 3, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1},
  {ERE, "x.{1,2}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1},
  {ERE, "x(.w|\xc3\x86)?z", "axz", 1, 1},
  {ERE, "x(.w|\xc3\x86)?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1},
  {ERE, "x(.w|\xc3\x86)?z", "ax\xc3\x86z", 1, 1},
  {ERE, "x(.w|\xc3\x86)?z", "ax\xe2\x80\x96wz", 1, 1},
  /* Should not be optimized.  */
  {BRE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
  {BRE, "x[A-Z,]y", "axCy", 1, 0},
  {BRE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
  {BRE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
  {BRE, "x[[=A=]]z", "axAz", 1, 0},
  {BRE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
  {BRE, "\\<g", "\xe2\x80\x94g", 3, 0},
  {BRE, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
  {BRE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
  {BRE, "a\\wz", "a\xc3\x84z", 0, 0},
  {BRE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
  {ERE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
  {ERE, "x[A-Z,]y", "axCy", 1, 0},
  {ERE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
  {ERE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
  {ERE, "x[[=A=]]z", "axAz", 1, 0},
  {ERE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0},
  {ERE, "\\<g", "\xe2\x80\x94g", 3, 0},
  {ERE, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
  {ERE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
  {ERE, "a\\wz", "a\xc3\x84z", 0, 0},
  {ERE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
};
186 | |
187 | int |
188 | main (void) |
189 | { |
190 | struct re_pattern_buffer regbuf; |
191 | const char *err; |
192 | size_t i; |
193 | int ret = 0; |
194 | |
195 | mtrace (); |
196 | |
197 | setlocale (LC_ALL, "de_DE.UTF-8" ); |
198 | for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) |
199 | { |
200 | int res, optimized; |
201 | |
202 | re_set_syntax (syntax: tests[i].syntax); |
203 | memset (®buf, '\0', sizeof (regbuf)); |
204 | err = re_compile_pattern (pattern: tests[i].pattern, length: strlen (tests[i].pattern), |
205 | buffer: ®buf); |
206 | if (err != NULL) |
207 | { |
208 | printf (format: "re_compile_pattern failed: %s\n" , err); |
209 | ret = 1; |
210 | continue; |
211 | } |
212 | |
213 | /* Check if re_search will be done as multi-byte or single-byte. */ |
214 | optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1; |
215 | if (optimized != tests[i].optimize) |
216 | { |
217 | printf (format: "pattern %zd %soptimized while it should%s be\n" , |
218 | i, optimized ? "" : "not " , tests[i].optimize ? "" : " not" ); |
219 | ret = 1; |
220 | } |
221 | |
222 | int str_len = strlen (tests[i].string); |
223 | res = re_search (buffer: ®buf, String: tests[i].string, length: str_len, start: 0, range: str_len, NULL); |
224 | if (res != tests[i].res) |
225 | { |
226 | printf (format: "re_search %zd failed: %d\n" , i, res); |
227 | ret = 1; |
228 | regfree (preg: ®buf); |
229 | continue; |
230 | } |
231 | |
232 | res = re_search (buffer: ®buf, String: tests[i].string, length: str_len, start: str_len, range: -str_len, |
233 | NULL); |
234 | if (res != tests[i].res) |
235 | { |
236 | printf (format: "backward re_search %zd failed: %d\n" , i, res); |
237 | ret = 1; |
238 | regfree (preg: ®buf); |
239 | continue; |
240 | } |
241 | regfree (preg: ®buf); |
242 | |
243 | re_set_syntax (syntax: tests[i].syntax | RE_ICASE); |
244 | memset (®buf, '\0', sizeof (regbuf)); |
245 | err = re_compile_pattern (pattern: tests[i].pattern, length: strlen (tests[i].pattern), |
246 | buffer: ®buf); |
247 | if (err != NULL) |
248 | { |
249 | printf (format: "re_compile_pattern failed: %s\n" , err); |
250 | ret = 1; |
251 | continue; |
252 | } |
253 | |
254 | /* Check if re_search will be done as multi-byte or single-byte. */ |
255 | optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1; |
256 | if (optimized) |
257 | { |
258 | printf (format: "pattern %zd optimized while it should not be when case insensitive\n" , |
259 | i); |
260 | ret = 1; |
261 | } |
262 | |
263 | res = re_search (buffer: ®buf, String: tests[i].string, length: str_len, start: 0, range: str_len, NULL); |
264 | if (res != tests[i].res) |
265 | { |
266 | printf (format: "ICASE re_search %zd failed: %d\n" , i, res); |
267 | ret = 1; |
268 | regfree (preg: ®buf); |
269 | continue; |
270 | } |
271 | |
272 | res = re_search (buffer: ®buf, String: tests[i].string, length: str_len, start: str_len, range: -str_len, |
273 | NULL); |
274 | if (res != tests[i].res) |
275 | { |
276 | printf (format: "ICASE backward re_search %zd failed: %d\n" , i, res); |
277 | ret = 1; |
278 | regfree (preg: ®buf); |
279 | continue; |
280 | } |
281 | regfree (preg: ®buf); |
282 | } |
283 | |
284 | return ret; |
285 | } |
286 | |