1 | // class template regex -*- C++ -*- |
2 | |
3 | // Copyright (C) 2013-2021 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | /** |
26 | * @file bits/regex_scanner.tcc |
27 | * This is an internal header file, included by other library headers. |
28 | * Do not attempt to use it directly. @headername{regex} |
29 | */ |
30 | |
31 | // FIXME make comments doxygen format. |
32 | |
33 | // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep |
34 | // and awk |
35 | // 1) grep is basic except '\n' is treated as '|' |
36 | // 2) egrep is extended except '\n' is treated as '|' |
37 | // 3) awk is extended except special escaping rules, and there's no |
38 | // back-reference. |
39 | // |
40 | // References: |
41 | // |
42 | // ECMAScript: ECMA-262 15.10 |
43 | // |
44 | // basic, extended: |
45 | // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html |
46 | // |
47 | // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html |
48 | |
49 | namespace std _GLIBCXX_VISIBILITY(default) |
50 | { |
51 | _GLIBCXX_BEGIN_NAMESPACE_VERSION |
52 | |
53 | namespace __detail |
54 | { |
55 | template<typename _CharT> |
56 | _Scanner<_CharT>:: |
57 | _Scanner(const _CharT* __begin, const _CharT* __end, |
58 | _FlagT __flags, std::locale __loc) |
59 | : _ScannerBase(__flags), |
60 | _M_current(__begin), _M_end(__end), |
61 | _M_ctype(std::use_facet<_CtypeT>(__loc)), |
62 | _M_eat_escape(_M_is_ecma() |
63 | ? &_Scanner::_M_eat_escape_ecma |
64 | : &_Scanner::_M_eat_escape_posix) |
65 | { _M_advance(); } |
66 | |
67 | template<typename _CharT> |
68 | void |
69 | _Scanner<_CharT>:: |
70 | _M_advance() |
71 | { |
72 | if (_M_current == _M_end) |
73 | { |
74 | _M_token = _S_token_eof; |
75 | return; |
76 | } |
77 | |
78 | if (_M_state == _S_state_normal) |
79 | _M_scan_normal(); |
80 | else if (_M_state == _S_state_in_bracket) |
81 | _M_scan_in_bracket(); |
82 | else if (_M_state == _S_state_in_brace) |
83 | _M_scan_in_brace(); |
84 | else |
85 | { |
86 | __glibcxx_assert(false); |
87 | } |
88 | } |
89 | |
90 | // Differences between styles: |
91 | // 1) "\(", "\)", "\{" in basic. It's not escaping. |
92 | // 2) "(?:", "(?=", "(?!" in ECMAScript. |
93 | template<typename _CharT> |
94 | void |
95 | _Scanner<_CharT>:: |
96 | _M_scan_normal() |
97 | { |
98 | auto __c = *_M_current++; |
99 | |
100 | if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) |
101 | { |
102 | _M_token = _S_token_ord_char; |
103 | _M_value.assign(1, __c); |
104 | return; |
105 | } |
106 | if (__c == '\\') |
107 | { |
108 | if (_M_current == _M_end) |
109 | __throw_regex_error( |
110 | ecode: regex_constants::error_escape, |
111 | what: "Unexpected end of regex when escaping." ); |
112 | |
113 | if (!_M_is_basic() |
114 | || (*_M_current != '(' |
115 | && *_M_current != ')' |
116 | && *_M_current != '{')) |
117 | { |
118 | (this->*_M_eat_escape)(); |
119 | return; |
120 | } |
121 | __c = *_M_current++; |
122 | } |
123 | if (__c == '(') |
124 | { |
125 | if (_M_is_ecma() && *_M_current == '?') |
126 | { |
127 | if (++_M_current == _M_end) |
128 | __throw_regex_error( |
129 | ecode: regex_constants::error_paren, |
130 | what: "Unexpected end of regex when in an open parenthesis." ); |
131 | |
132 | if (*_M_current == ':') |
133 | { |
134 | ++_M_current; |
135 | _M_token = _S_token_subexpr_no_group_begin; |
136 | } |
137 | else if (*_M_current == '=') |
138 | { |
139 | ++_M_current; |
140 | _M_token = _S_token_subexpr_lookahead_begin; |
141 | _M_value.assign(1, 'p'); |
142 | } |
143 | else if (*_M_current == '!') |
144 | { |
145 | ++_M_current; |
146 | _M_token = _S_token_subexpr_lookahead_begin; |
147 | _M_value.assign(1, 'n'); |
148 | } |
149 | else |
150 | __throw_regex_error( |
151 | ecode: regex_constants::error_paren, |
152 | what: "Invalid special open parenthesis." ); |
153 | } |
154 | else if (_M_flags & regex_constants::nosubs) |
155 | _M_token = _S_token_subexpr_no_group_begin; |
156 | else |
157 | _M_token = _S_token_subexpr_begin; |
158 | } |
159 | else if (__c == ')') |
160 | _M_token = _S_token_subexpr_end; |
161 | else if (__c == '[') |
162 | { |
163 | _M_state = _S_state_in_bracket; |
164 | _M_at_bracket_start = true; |
165 | if (_M_current != _M_end && *_M_current == '^') |
166 | { |
167 | _M_token = _S_token_bracket_neg_begin; |
168 | ++_M_current; |
169 | } |
170 | else |
171 | _M_token = _S_token_bracket_begin; |
172 | } |
173 | else if (__c == '{') |
174 | { |
175 | _M_state = _S_state_in_brace; |
176 | _M_token = _S_token_interval_begin; |
177 | } |
178 | else if (__builtin_expect(__c == _CharT(0), false)) |
179 | { |
180 | if (!_M_is_ecma()) |
181 | { |
182 | __throw_regex_error(ecode: regex_constants::_S_null, |
183 | what: "Unexpected null character in regular expression" ); |
184 | } |
185 | _M_token = _S_token_ord_char; |
186 | _M_value.assign(1, __c); |
187 | } |
188 | else if (__c != ']' && __c != '}') |
189 | { |
190 | auto __it = _M_token_tbl; |
191 | auto __narrowc = _M_ctype.narrow(__c, '\0'); |
192 | for (; __it->first != '\0'; ++__it) |
193 | if (__it->first == __narrowc) |
194 | { |
195 | _M_token = __it->second; |
196 | return; |
197 | } |
198 | __glibcxx_assert(false); |
199 | } |
200 | else |
201 | { |
202 | _M_token = _S_token_ord_char; |
203 | _M_value.assign(1, __c); |
204 | } |
205 | } |
206 | |
207 | // Differences between styles: |
208 | // 1) different semantics of "[]" and "[^]". |
209 | // 2) Escaping in bracket expr. |
210 | template<typename _CharT> |
211 | void |
212 | _Scanner<_CharT>:: |
213 | _M_scan_in_bracket() |
214 | { |
215 | if (_M_current == _M_end) |
216 | __throw_regex_error( |
217 | ecode: regex_constants::error_brack, |
218 | what: "Unexpected end of regex when in bracket expression." ); |
219 | |
220 | auto __c = *_M_current++; |
221 | |
222 | if (__c == '-') |
223 | _M_token = _S_token_bracket_dash; |
224 | else if (__c == '[') |
225 | { |
226 | if (_M_current == _M_end) |
227 | __throw_regex_error(ecode: regex_constants::error_brack, |
228 | what: "Unexpected character class open bracket." ); |
229 | |
230 | if (*_M_current == '.') |
231 | { |
232 | _M_token = _S_token_collsymbol; |
233 | _M_eat_class(*_M_current++); |
234 | } |
235 | else if (*_M_current == ':') |
236 | { |
237 | _M_token = _S_token_char_class_name; |
238 | _M_eat_class(*_M_current++); |
239 | } |
240 | else if (*_M_current == '=') |
241 | { |
242 | _M_token = _S_token_equiv_class_name; |
243 | _M_eat_class(*_M_current++); |
244 | } |
245 | else |
246 | { |
247 | _M_token = _S_token_ord_char; |
248 | _M_value.assign(1, __c); |
249 | } |
250 | } |
251 | // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted |
252 | // literally. So "[]]" and "[^]]" are valid regexes. See the testcases |
253 | // `*/empty_range.cc`. |
254 | else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) |
255 | { |
256 | _M_token = _S_token_bracket_end; |
257 | _M_state = _S_state_normal; |
258 | } |
259 | // ECMAScript and awk permits escaping in bracket. |
260 | else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) |
261 | (this->*_M_eat_escape)(); |
262 | else |
263 | { |
264 | _M_token = _S_token_ord_char; |
265 | _M_value.assign(1, __c); |
266 | } |
267 | _M_at_bracket_start = false; |
268 | } |
269 | |
270 | // Differences between styles: |
271 | // 1) "\}" in basic style. |
272 | template<typename _CharT> |
273 | void |
274 | _Scanner<_CharT>:: |
275 | _M_scan_in_brace() |
276 | { |
277 | if (_M_current == _M_end) |
278 | __throw_regex_error( |
279 | ecode: regex_constants::error_brace, |
280 | what: "Unexpected end of regex when in brace expression." ); |
281 | |
282 | auto __c = *_M_current++; |
283 | |
284 | if (_M_ctype.is(_CtypeT::digit, __c)) |
285 | { |
286 | _M_token = _S_token_dup_count; |
287 | _M_value.assign(1, __c); |
288 | while (_M_current != _M_end |
289 | && _M_ctype.is(_CtypeT::digit, *_M_current)) |
290 | _M_value += *_M_current++; |
291 | } |
292 | else if (__c == ',') |
293 | _M_token = _S_token_comma; |
294 | // basic use \}. |
295 | else if (_M_is_basic()) |
296 | { |
297 | if (__c == '\\' && _M_current != _M_end && *_M_current == '}') |
298 | { |
299 | _M_state = _S_state_normal; |
300 | _M_token = _S_token_interval_end; |
301 | ++_M_current; |
302 | } |
303 | else |
304 | __throw_regex_error(ecode: regex_constants::error_badbrace, |
305 | what: "Unexpected character in brace expression." ); |
306 | } |
307 | else if (__c == '}') |
308 | { |
309 | _M_state = _S_state_normal; |
310 | _M_token = _S_token_interval_end; |
311 | } |
312 | else |
313 | __throw_regex_error(ecode: regex_constants::error_badbrace, |
314 | what: "Unexpected character in brace expression." ); |
315 | } |
316 | |
317 | template<typename _CharT> |
318 | void |
319 | _Scanner<_CharT>:: |
320 | _M_eat_escape_ecma() |
321 | { |
322 | if (_M_current == _M_end) |
323 | __throw_regex_error(ecode: regex_constants::error_escape, |
324 | what: "Unexpected end of regex when escaping." ); |
325 | |
326 | auto __c = *_M_current++; |
327 | auto __pos = _M_find_escape(c: _M_ctype.narrow(__c, '\0')); |
328 | |
329 | if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) |
330 | { |
331 | _M_token = _S_token_ord_char; |
332 | _M_value.assign(1, *__pos); |
333 | } |
334 | else if (__c == 'b') |
335 | { |
336 | _M_token = _S_token_word_bound; |
337 | _M_value.assign(1, 'p'); |
338 | } |
339 | else if (__c == 'B') |
340 | { |
341 | _M_token = _S_token_word_bound; |
342 | _M_value.assign(1, 'n'); |
343 | } |
344 | // N3376 28.13 |
345 | else if (__c == 'd' |
346 | || __c == 'D' |
347 | || __c == 's' |
348 | || __c == 'S' |
349 | || __c == 'w' |
350 | || __c == 'W') |
351 | { |
352 | _M_token = _S_token_quoted_class; |
353 | _M_value.assign(1, __c); |
354 | } |
355 | else if (__c == 'c') |
356 | { |
357 | if (_M_current == _M_end) |
358 | __throw_regex_error( |
359 | ecode: regex_constants::error_escape, |
360 | what: "Unexpected end of regex when reading control code." ); |
361 | _M_token = _S_token_ord_char; |
362 | _M_value.assign(1, *_M_current++); |
363 | } |
364 | else if (__c == 'x' || __c == 'u') |
365 | { |
366 | _M_value.erase(); |
367 | for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) |
368 | { |
369 | if (_M_current == _M_end |
370 | || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) |
371 | __throw_regex_error( |
372 | ecode: regex_constants::error_escape, |
373 | what: "Unexpected end of regex when ascii character." ); |
374 | _M_value += *_M_current++; |
375 | } |
376 | _M_token = _S_token_hex_num; |
377 | } |
378 | // ECMAScript recognizes multi-digit back-references. |
379 | else if (_M_ctype.is(_CtypeT::digit, __c)) |
380 | { |
381 | _M_value.assign(1, __c); |
382 | while (_M_current != _M_end |
383 | && _M_ctype.is(_CtypeT::digit, *_M_current)) |
384 | _M_value += *_M_current++; |
385 | _M_token = _S_token_backref; |
386 | } |
387 | else |
388 | { |
389 | _M_token = _S_token_ord_char; |
390 | _M_value.assign(1, __c); |
391 | } |
392 | } |
393 | |
394 | // Differences between styles: |
395 | // 1) Extended doesn't support backref, but basic does. |
396 | template<typename _CharT> |
397 | void |
398 | _Scanner<_CharT>:: |
399 | _M_eat_escape_posix() |
400 | { |
401 | if (_M_current == _M_end) |
402 | __throw_regex_error(ecode: regex_constants::error_escape, |
403 | what: "Unexpected end of regex when escaping." ); |
404 | |
405 | auto __c = *_M_current; |
406 | auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); |
407 | |
408 | if (__pos != nullptr && *__pos != '\0') |
409 | { |
410 | _M_token = _S_token_ord_char; |
411 | _M_value.assign(1, __c); |
412 | } |
413 | // We MUST judge awk before handling backrefs. There's no backref in awk. |
414 | else if (_M_is_awk()) |
415 | { |
416 | _M_eat_escape_awk(); |
417 | return; |
418 | } |
419 | else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') |
420 | { |
421 | _M_token = _S_token_backref; |
422 | _M_value.assign(1, __c); |
423 | } |
424 | else |
425 | { |
426 | #ifdef __STRICT_ANSI__ |
427 | // POSIX says it is undefined to escape ordinary characters |
428 | __throw_regex_error(regex_constants::error_escape, |
429 | "Unexpected escape character." ); |
430 | #else |
431 | _M_token = _S_token_ord_char; |
432 | _M_value.assign(1, __c); |
433 | #endif |
434 | } |
435 | ++_M_current; |
436 | } |
437 | |
438 | template<typename _CharT> |
439 | void |
440 | _Scanner<_CharT>:: |
441 | _M_eat_escape_awk() |
442 | { |
443 | auto __c = *_M_current++; |
444 | auto __pos = _M_find_escape(c: _M_ctype.narrow(__c, '\0')); |
445 | |
446 | if (__pos != nullptr) |
447 | { |
448 | _M_token = _S_token_ord_char; |
449 | _M_value.assign(1, *__pos); |
450 | } |
451 | // \ddd for oct representation |
452 | else if (_M_ctype.is(_CtypeT::digit, __c) |
453 | && __c != '8' |
454 | && __c != '9') |
455 | { |
456 | _M_value.assign(1, __c); |
457 | for (int __i = 0; |
458 | __i < 2 |
459 | && _M_current != _M_end |
460 | && _M_ctype.is(_CtypeT::digit, *_M_current) |
461 | && *_M_current != '8' |
462 | && *_M_current != '9'; |
463 | __i++) |
464 | _M_value += *_M_current++; |
465 | _M_token = _S_token_oct_num; |
466 | return; |
467 | } |
468 | else |
469 | __throw_regex_error(ecode: regex_constants::error_escape, |
470 | what: "Unexpected escape character." ); |
471 | } |
472 | |
473 | // Eats a character class or throws an exception. |
474 | // __ch could be ':', '.' or '=', _M_current is the char after ']' when |
475 | // returning. |
476 | template<typename _CharT> |
477 | void |
478 | _Scanner<_CharT>:: |
479 | _M_eat_class(char __ch) |
480 | { |
481 | for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) |
482 | _M_value += *_M_current++; |
483 | if (_M_current == _M_end |
484 | || *_M_current++ != __ch |
485 | || _M_current == _M_end // skip __ch |
486 | || *_M_current++ != ']') // skip ']' |
487 | { |
488 | if (__ch == ':') |
489 | __throw_regex_error(ecode: regex_constants::error_ctype, |
490 | what: "Unexpected end of character class." ); |
491 | else |
492 | __throw_regex_error(ecode: regex_constants::error_collate, |
493 | what: "Unexpected end of character class." ); |
494 | } |
495 | } |
496 | |
#ifdef _GLIBCXX_DEBUG
// Debug helper: writes a one-line, human readable description of the
// current token (and of _M_value where the token carries one) to ostr
// and returns ostr.
template<typename _CharT>
  std::ostream&
  _Scanner<_CharT>::
  _M_print(std::ostream& ostr)
  {
    switch (_M_token)
    {
    case _S_token_anychar:
      ostr << "any-character\n";
      break;
    case _S_token_backref:
      ostr << "backref\n";
      break;
    case _S_token_bracket_begin:
      ostr << "bracket-begin\n";
      break;
    case _S_token_bracket_neg_begin:
      ostr << "bracket-neg-begin\n";
      break;
    case _S_token_bracket_end:
      ostr << "bracket-end\n";
      break;
    // Produced by _M_scan_in_bracket() for '-'.
    case _S_token_bracket_dash:
      ostr << "bracket-dash\n";
      break;
    case _S_token_char_class_name:
      ostr << "char-class-name \"" << _M_value << "\"\n";
      break;
    case _S_token_closure0:
      ostr << "closure0\n";
      break;
    case _S_token_closure1:
      ostr << "closure1\n";
      break;
    case _S_token_collsymbol:
      ostr << "collsymbol \"" << _M_value << "\"\n";
      break;
    case _S_token_comma:
      ostr << "comma\n";
      break;
    case _S_token_dup_count:
      ostr << "dup count: " << _M_value << "\n";
      break;
    case _S_token_eof:
      ostr << "EOF\n";
      break;
    case _S_token_equiv_class_name:
      ostr << "equiv-class-name \"" << _M_value << "\"\n";
      break;
    case _S_token_interval_begin:
      ostr << "interval begin\n";
      break;
    case _S_token_interval_end:
      ostr << "interval end\n";
      break;
    case _S_token_line_begin:
      ostr << "line begin\n";
      break;
    case _S_token_line_end:
      ostr << "line end\n";
      break;
    case _S_token_opt:
      ostr << "opt\n";
      break;
    case _S_token_or:
      ostr << "or\n";
      break;
    case _S_token_ord_char:
      ostr << "ordinary character: \"" << _M_value << "\"\n";
      break;
    case _S_token_subexpr_begin:
      ostr << "subexpr begin\n";
      break;
    case _S_token_subexpr_no_group_begin:
      ostr << "no grouping subexpr begin\n";
      break;
    case _S_token_subexpr_lookahead_begin:
      ostr << "lookahead subexpr begin\n";
      break;
    case _S_token_subexpr_end:
      ostr << "subexpr end\n";
      break;
    case _S_token_unknown:
      ostr << "-- unknown token --\n";
      break;
    case _S_token_oct_num:
      ostr << "oct number " << _M_value << "\n";
      break;
    case _S_token_hex_num:
      ostr << "hex number " << _M_value << "\n";
      break;
    case _S_token_quoted_class:
      ostr << "quoted class " << "\\" << _M_value << "\n";
      break;
    // Produced by _M_eat_escape_ecma() for "\b" / "\B"; _M_value holds
    // 'p' or 'n' respectively.
    case _S_token_word_bound:
      ostr << "word bound \"" << _M_value << "\"\n";
      break;
    default:
      _GLIBCXX_DEBUG_ASSERT(false);
    }
    return ostr;
  }
#endif
595 | |
596 | } // namespace __detail |
597 | _GLIBCXX_END_NAMESPACE_VERSION |
598 | } // namespace |
599 | |