regex_compiler.tcc source code [include/c++/11/bits/regex_compiler.tcc]

1	// class template regex -- C++ --
2
3	// Copyright (C) 2013-2021 Free Software Foundation, Inc.
4	//
5	// This file is part of the GNU ISO C++ Library. This library is free
6	// software; you can redistribute it and/or modify it under the
7	// terms of the GNU General Public License as published by the
8	// Free Software Foundation; either version 3, or (at your option)
9	// any later version.
10
11	// This library is distributed in the hope that it will be useful,
12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	// GNU General Public License for more details.
15
16	// Under Section 7 of GPL version 3, you are granted additional
17	// permissions described in the GCC Runtime Library Exception, version
18	// 3.1, as published by the Free Software Foundation.
19
20	// You should have received a copy of the GNU General Public License and
21	// a copy of the GCC Runtime Library Exception along with this program;
22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23	// <http://www.gnu.org/licenses/>.
24
25	/**
26	* @file bits/regex_compiler.tcc
27	* This is an internal header file, included by other library headers.
28	* Do not attempt to use it directly. @headername{regex}
29	*/
30
31	// FIXME make comments doxygen format.
32
33	/*
34	// This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
35	// (http://swtch.com/~rsc/regexp/regexp1.html),
36	// but doesn't strictly follow it.
37	//
// When compiling, states are *chained* instead of tree- or graph-constructed.
39	// It's more like structured programs: there's if statement and loop statement.
40	//
// For alternative structure (say "a|b"), aka "if statement", two branches
// should be constructed. However, these two shall merge to an "end_tag" at
// the end of this operator:
//
//                branch1
//              /        \
// => begin_tag            end_tag =>
//              \        /
//                branch2
50	//
51	// This is the difference between this implementation and that in Russ's
52	// article.
53	//
54	// That's why we introduced dummy node here ------ "end_tag" is a dummy node.
55	// All dummy nodes will be eliminated at the end of compilation.
56	*/
57
58	namespace std _GLIBCXX_VISIBILITY(default)
59	{
60	_GLIBCXX_BEGIN_NAMESPACE_VERSION
61
62	namespace __detail
63	{
64	template<typename _TraitsT>
65	_Compiler<_TraitsT>::
66	_Compiler(const _CharT* __b, const _CharT* __e,
67	const typename _TraitsT::locale_type& __loc, _FlagT __flags)
68	: _M_flags(_S_validate(f: __flags)),
69	_M_scanner(__b, __e, _M_flags, __loc),
70	_M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
71	_M_traits(_M_nfa->_M_traits),
72	_M_ctype(std::use_facet<_CtypeT>(__loc))
73	{
74	_StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
75	__r._M_append(_M_nfa->_M_insert_subexpr_begin());
76	this->_M_disjunction();
77	if (!_M_match_token(token: _ScannerT::_S_token_eof))
78	__throw_regex_error(ecode: regex_constants::error_paren);
79	__r._M_append(_M_pop());
80	__glibcxx_assert(_M_stack.empty());
81	__r._M_append(_M_nfa->_M_insert_subexpr_end());
82	__r._M_append(_M_nfa->_M_insert_accept());
83	_M_nfa->_M_eliminate_dummy();
84	}
85
86	template<typename _TraitsT>
87	void
88	_Compiler<_TraitsT>::
89	_M_disjunction()
90	{
91	this->_M_alternative();
92	while (_M_match_token(token: _ScannerT::_S_token_or))
93	{
94	_StateSeqT __alt1 = _M_pop();
95	this->_M_alternative();
96	_StateSeqT __alt2 = _M_pop();
97	auto __end = _M_nfa->_M_insert_dummy();
98	__alt1._M_append(__end);
99	__alt2._M_append(__end);
100	// __alt2 is state._M_next, __alt1 is state._M_alt. The executor
101	// executes _M_alt before _M_next, as well as executing left
102	// alternative before right one.
103	_M_stack.push(_StateSeqT(*_M_nfa,
104	_M_nfa->_M_insert_alt(
105	__alt2._M_start, __alt1._M_start, false),
106	__end));
107	}
108	}
109
110	template<typename _TraitsT>
111	void
112	_Compiler<_TraitsT>::
113	_M_alternative()
114	{
115	if (this->_M_term())
116	{
117	_StateSeqT __re = _M_pop();
118	this->_M_alternative();
119	__re._M_append(_M_pop());
120	_M_stack.push(__re);
121	}
122	else
123	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
124	}
125
126	template<typename _TraitsT>
127	bool
128	_Compiler<_TraitsT>::
129	_M_term()
130	{
131	if (this->_M_assertion())
132	return true;
133	if (this->_M_atom())
134	{
135	while (this->_M_quantifier())
136	;
137	return true;
138	}
139	return false;
140	}
141
142	template<typename _TraitsT>
143	bool
144	_Compiler<_TraitsT>::
145	_M_assertion()
146	{
147	if (_M_match_token(token: _ScannerT::_S_token_line_begin))
148	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
149	else if (_M_match_token(token: _ScannerT::_S_token_line_end))
150	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
151	else if (_M_match_token(token: _ScannerT::_S_token_word_bound))
152	// _M_value[0] == 'n' means it's negative, say "not word boundary".
153	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
154	_M_insert_word_bound(_M_value[`0`] == `'n'`)));
155	else if (_M_match_token(token: _ScannerT::_S_token_subexpr_lookahead_begin))
156	{
157	auto __neg = _M_value[`0`] == `'n'`;
158	this->_M_disjunction();
159	if (!_M_match_token(token: _ScannerT::_S_token_subexpr_end))
160	__throw_regex_error(ecode: regex_constants::error_paren,
161	what: "Parenthesis is not closed.");
162	auto __tmp = _M_pop();
163	__tmp._M_append(_M_nfa->_M_insert_accept());
164	_M_stack.push(
165	_StateSeqT(
166	*_M_nfa,
167	_M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
168	}
169	else
170	return false;
171	return true;
172	}
173
174	template<typename _TraitsT>
175	bool
176	_Compiler<_TraitsT>::
177	_M_quantifier()
178	{
179	bool __neg = (_M_flags & regex_constants::ECMAScript);
180	auto __init = [this, &__neg]()
181	{
182	if (_M_stack.empty())
183	__throw_regex_error(ecode: regex_constants::error_badrepeat,
184	what: "Nothing to repeat before a quantifier.");
185	__neg = __neg && _M_match_token(token: _ScannerT::_S_token_opt);
186	};
187	if (_M_match_token(token: _ScannerT::_S_token_closure0))
188	{
189	__init();
190	auto __e = _M_pop();
191	_StateSeqT __r(*_M_nfa,
192	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
193	__e._M_start, __neg));
194	__e._M_append(__r);
195	_M_stack.push(__r);
196	}
197	else if (_M_match_token(token: _ScannerT::_S_token_closure1))
198	{
199	__init();
200	auto __e = _M_pop();
201	__e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
202	__e._M_start, __neg));
203	_M_stack.push(__e);
204	}
205	else if (_M_match_token(token: _ScannerT::_S_token_opt))
206	{
207	__init();
208	auto __e = _M_pop();
209	auto __end = _M_nfa->_M_insert_dummy();
210	_StateSeqT __r(*_M_nfa,
211	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
212	__e._M_start, __neg));
213	__e._M_append(__end);
214	__r._M_append(__end);
215	_M_stack.push(__r);
216	}
217	else if (_M_match_token(token: _ScannerT::_S_token_interval_begin))
218	{
219	if (_M_stack.empty())
220	__throw_regex_error(ecode: regex_constants::error_badrepeat,
221	what: "Nothing to repeat before a quantifier.");
222	if (!_M_match_token(token: _ScannerT::_S_token_dup_count))
223	__throw_regex_error(ecode: regex_constants::error_badbrace,
224	what: "Unexpected token in brace expression.");
225	_StateSeqT __r(_M_pop());
226	_StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
227	long __min_rep = _M_cur_int_value(radix: `10`);
228	bool __infi = false;
229	long __n = `0`;
230
231	// {3
232	if (_M_match_token(token: _ScannerT::_S_token_comma))
233	{
234	if (_M_match_token(token: _ScannerT::_S_token_dup_count)) // {3,7}
235	__n = _M_cur_int_value(radix: `10`) - __min_rep;
236	else
237	__infi = true;
238	}
239	if (!_M_match_token(token: _ScannerT::_S_token_interval_end))
240	__throw_regex_error(ecode: regex_constants::error_brace,
241	what: "Unexpected end of brace expression.");
242
243	__neg = __neg && _M_match_token(token: _ScannerT::_S_token_opt);
244
245	for (long __i = `0`; __i < __min_rep; ++__i)
246	__e._M_append(__r._M_clone());
247
248	if (__infi)
249	{
250	auto __tmp = __r._M_clone();
251	_StateSeqT __s(*_M_nfa,
252	_M_nfa->_M_insert_repeat(_S_invalid_state_id,
253	__tmp._M_start, __neg));
254	__tmp._M_append(__s);
255	__e._M_append(__s);
256	}
257	else
258	{
259	if (__n < `0`)
260	__throw_regex_error(ecode: regex_constants::error_badbrace,
261	what: "Invalid range in brace expression.");
262	auto __end = _M_nfa->_M_insert_dummy();
263	// _M_alt is the "match more" branch, and _M_next is the
264	// "match less" one. Switch _M_alt and _M_next of all created
265	// nodes. This is a hack but IMO works well.
266	std::stack<_StateIdT> __stack;
267	for (long __i = `0`; __i < __n; ++__i)
268	{
269	auto __tmp = __r._M_clone();
270	auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
271	__end, __neg);
272	__stack.push(__alt);
273	__e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
274	}
275	__e._M_append(__end);
276	while (!__stack.empty())
277	{
278	auto& __tmp = (*_M_nfa)[__stack.top()];
279	__stack.pop();
280	std::swap(__tmp._M_next, __tmp._M_alt);
281	}
282	}
283	_M_stack.push(__e);
284	}
285	else
286	return false;
287	return true;
288	}
289
// Call the member template __func<__icase, __collate>(args...), choosing
// the two boolean template arguments from the icase and collate bits of
// _M_flags at run time. Wrapped in do { } while (false) so that an
// invocation followed by ';' forms a single statement.
#define __INSERT_REGEX_MATCHER(__func, ...)                     \
  do {                                                          \
    if (_M_flags & regex_constants::icase)                      \
      {                                                         \
        if (_M_flags & regex_constants::collate)                \
          __func<true, true>(__VA_ARGS__);                      \
        else                                                    \
          __func<true, false>(__VA_ARGS__);                     \
      }                                                         \
    else                                                        \
      {                                                         \
        if (_M_flags & regex_constants::collate)                \
          __func<false, true>(__VA_ARGS__);                     \
        else                                                    \
          __func<false, false>(__VA_ARGS__);                    \
      }                                                         \
  } while (false)
303
304	template<typename _TraitsT>
305	bool
306	_Compiler<_TraitsT>::
307	_M_atom()
308	{
309	if (_M_match_token(token: _ScannerT::_S_token_anychar))
310	{
311	if (!(_M_flags & regex_constants::ECMAScript))
312	__INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
313	else
314	__INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
315	}
316	else if (_M_try_char())
317	__INSERT_REGEX_MATCHER(_M_insert_char_matcher);
318	else if (_M_match_token(token: _ScannerT::_S_token_backref))
319	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
320	_M_insert_backref(_M_cur_int_value(radix: `10`))));
321	else if (_M_match_token(token: _ScannerT::_S_token_quoted_class))
322	__INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
323	else if (_M_match_token(token: _ScannerT::_S_token_subexpr_no_group_begin))
324	{
325	_StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
326	this->_M_disjunction();
327	if (!_M_match_token(token: _ScannerT::_S_token_subexpr_end))
328	__throw_regex_error(ecode: regex_constants::error_paren,
329	what: "Parenthesis is not closed.");
330	__r._M_append(_M_pop());
331	_M_stack.push(__r);
332	}
333	else if (_M_match_token(token: _ScannerT::_S_token_subexpr_begin))
334	{
335	_StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
336	this->_M_disjunction();
337	if (!_M_match_token(token: _ScannerT::_S_token_subexpr_end))
338	__throw_regex_error(ecode: regex_constants::error_paren,
339	what: "Parenthesis is not closed.");
340	__r._M_append(_M_pop());
341	__r._M_append(_M_nfa->_M_insert_subexpr_end());
342	_M_stack.push(__r);
343	}
344	else if (!_M_bracket_expression())
345	return false;
346	return true;
347	}
348
349	template<typename _TraitsT>
350	bool
351	_Compiler<_TraitsT>::
352	_M_bracket_expression()
353	{
354	bool __neg =
355	_M_match_token(token: _ScannerT::_S_token_bracket_neg_begin);
356	if (!(__neg \|\| _M_match_token(token: _ScannerT::_S_token_bracket_begin)))
357	return false;
358	__INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
359	return true;
360	}
361	#undef __INSERT_REGEX_MATCHER
362
363	template<typename _TraitsT>
364	template<bool __icase, bool __collate>
365	void
366	_Compiler<_TraitsT>::
367	_M_insert_any_matcher_ecma()
368	{
369	_M_stack.push(_StateSeqT(*_M_nfa,
370	_M_nfa->_M_insert_matcher
371	(_AnyMatcher<_TraitsT, true, __icase, __collate>
372	(_M_traits))));
373	}
374
375	template<typename _TraitsT>
376	template<bool __icase, bool __collate>
377	void
378	_Compiler<_TraitsT>::
379	_M_insert_any_matcher_posix()
380	{
381	_M_stack.push(_StateSeqT(*_M_nfa,
382	_M_nfa->_M_insert_matcher
383	(_AnyMatcher<_TraitsT, false, __icase, __collate>
384	(_M_traits))));
385	}
386
387	template<typename _TraitsT>
388	template<bool __icase, bool __collate>
389	void
390	_Compiler<_TraitsT>::
391	_M_insert_char_matcher()
392	{
393	_M_stack.push(_StateSeqT(*_M_nfa,
394	_M_nfa->_M_insert_matcher
395	(_CharMatcher<_TraitsT, __icase, __collate>
396	(_M_value[`0`], _M_traits))));
397	}
398
399	template<typename _TraitsT>
400	template<bool __icase, bool __collate>
401	void
402	_Compiler<_TraitsT>::
403	_M_insert_character_class_matcher()
404	{
405	__glibcxx_assert(_M_value.size() == `1`);
406	_BracketMatcher<__icase, __collate> __matcher
407	(_M_ctype.is(_CtypeT::upper, _M_value[`0`]), _M_traits);
408	__matcher._M_add_character_class(_M_value, false);
409	__matcher._M_ready();
410	_M_stack.push(_StateSeqT(*_M_nfa,
411	_M_nfa->_M_insert_matcher(std::move(__matcher))));
412	}
413
414	template<typename _TraitsT>
415	template<bool __icase, bool __collate>
416	void
417	_Compiler<_TraitsT>::
418	_M_insert_bracket_matcher(bool __neg)
419	{
420	_BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits);
421	_BracketState __last_char;
422	if (_M_try_char())
423	__last_char.set(_M_value[`0`]);
424	else if (_M_match_token(token: _ScannerT::_S_token_bracket_dash))
425	// Dash as first character is a normal character.
426	__last_char.set(`'-'`);
427	while (_M_expression_term(__last_char, __matcher))
428	;
429	if (__last_char._M_is_char())
430	__matcher._M_add_char(__last_char.get());
431	__matcher._M_ready();
432	_M_stack.push(_StateSeqT(
433	*_M_nfa,
434	_M_nfa->_M_insert_matcher(std::move(__matcher))));
435	}
436
437	template<typename _TraitsT>
438	template<bool __icase, bool __collate>
439	bool
440	_Compiler<_TraitsT>::
441	_M_expression_term(_BracketState& __last_char,
442	_BracketMatcher<__icase, __collate>& __matcher)
443	{
444	if (_M_match_token(token: _ScannerT::_S_token_bracket_end))
445	return false;
446
447	// Add any previously cached char into the matcher and update cache.
448	const auto __push_char = [&](_CharT __ch)
449	{
450	if (__last_char._M_is_char())
451	__matcher._M_add_char(__last_char.get());
452	__last_char.set(__ch);
453	};
454	// Add any previously cached char into the matcher and update cache.
455	const auto __push_class = [&]
456	{
457	if (__last_char._M_is_char())
458	__matcher._M_add_char(__last_char.get());
459	// We don't cache anything here, just record that the last thing
460	// processed was a character class (or similar).
461	__last_char.reset(_BracketState::_Type::_Class);
462	};
463
464	if (_M_match_token(token: _ScannerT::_S_token_collsymbol))
465	{
466	auto __symbol = __matcher._M_add_collate_element(_M_value);
467	if (__symbol.size() == `1`)
468	__push_char(__symbol[`0`]);
469	else
470	__push_class();
471	}
472	else if (_M_match_token(token: _ScannerT::_S_token_equiv_class_name))
473	{
474	__push_class();
475	__matcher._M_add_equivalence_class(_M_value);
476	}
477	else if (_M_match_token(token: _ScannerT::_S_token_char_class_name))
478	{
479	__push_class();
480	__matcher._M_add_character_class(_M_value, false);
481	}
482	else if (_M_try_char())
483	__push_char(_M_value[`0`]);
484	// POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
485	// except when the '-' is the first or last character in the bracket
486	// expression ([--0]). ECMAScript treats all '-' after a range as a
487	// normal character. Also see above, where _M_expression_term gets called.
488	//
489	// As a result, POSIX rejects [-----], but ECMAScript doesn't.
490	// Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
491	// Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
492	//
493	// It turns out that no one reads BNFs ;)
494	else if (_M_match_token(token: _ScannerT::_S_token_bracket_dash))
495	{
496	if (_M_match_token(token: _ScannerT::_S_token_bracket_end))
497	{
498	// For "-]" the dash is a literal character.
499	__push_char(`'-'`);
500	return false;
501	}
502	else if (__last_char._M_is_class())
503	{
504	// "\\w-" is invalid, start of range must be a single char.
505	__throw_regex_error(ecode: regex_constants::error_range,
506	what: "Invalid start of range in bracket expression.");
507	}
508	else if (__last_char._M_is_char())
509	{
510	if (_M_try_char())
511	{
512	// "x-y"
513	__matcher._M_make_range(__last_char.get(), _M_value[`0`]);
514	__last_char.reset();
515	}
516	else if (_M_match_token(token: _ScannerT::_S_token_bracket_dash))
517	{
518	// "x--"
519	__matcher._M_make_range(__last_char.get(), `'-'`);
520	__last_char.reset();
521	}
522	else
523	__throw_regex_error(ecode: regex_constants::error_range,
524	what: "Invalid end of range in bracket expression.");
525	}
526	else if (_M_flags & regex_constants::ECMAScript)
527	{
528	// A dash that is not part of an existing range. Might be the
529	// start of a new range, or might just be a literal '-' char.
530	// Only ECMAScript allows that in the middle of a bracket expr.
531	__push_char(`'-'`);
532	}
533	else
534	__throw_regex_error(ecode: regex_constants::error_range,
535	what: "Invalid dash in bracket expression.");
536	}
537	else if (_M_match_token(token: _ScannerT::_S_token_quoted_class))
538	{
539	__push_class();
540	__matcher._M_add_character_class(_M_value,
541	_M_ctype.is(_CtypeT::upper,
542	_M_value[`0`]));
543	}
544	else
545	__throw_regex_error(ecode: regex_constants::error_brack,
546	what: "Unexpected character in bracket expression.");
547
548	return true;
549	}
550
551	template<typename _TraitsT>
552	bool
553	_Compiler<_TraitsT>::
554	_M_try_char()
555	{
556	bool __is_char = false;
557	if (_M_match_token(token: _ScannerT::_S_token_oct_num))
558	{
559	__is_char = true;
560	_M_value.assign(`1`, _M_cur_int_value(radix: `8`));
561	}
562	else if (_M_match_token(token: _ScannerT::_S_token_hex_num))
563	{
564	__is_char = true;
565	_M_value.assign(`1`, _M_cur_int_value(radix: `16`));
566	}
567	else if (_M_match_token(token: _ScannerT::_S_token_ord_char))
568	__is_char = true;
569	return __is_char;
570	}
571
572	template<typename _TraitsT>
573	bool
574	_Compiler<_TraitsT>::
575	_M_match_token(_TokenT __token)
576	{
577	if (__token == _M_scanner._M_get_token())
578	{
579	_M_value = _M_scanner._M_get_value();
580	_M_scanner._M_advance();
581	return true;
582	}
583	return false;
584	}
585
586	template<typename _TraitsT>
587	int
588	_Compiler<_TraitsT>::
589	_M_cur_int_value(int __radix)
590	{
591	int __v = `0`;
592	for (_CharT __c : _M_value)
593	if (__builtin_mul_overflow(__v, __radix, &__v)
594	\|\| __builtin_add_overflow(__v, _M_traits.value(__c, __radix), &__v))
595	std::__throw_regex_error(ecode: regex_constants::error_backref,
596	what: "invalid back reference");
597	return __v;
598	}
599
600	template<typename _TraitsT, bool __icase, bool __collate>
601	bool
602	_BracketMatcher<_TraitsT, __icase, __collate>::
603	_M_apply(_CharT __ch, false_type) const
604	{
605	return [this, __ch]
606	{
607	if (std::binary_search(_M_char_set.begin(), _M_char_set.end(),
608	_M_translator._M_translate(__ch)))
609	return true;
610	auto __s = _M_translator._M_transform(__ch);
611	for (auto& __it : _M_range_set)
612	if (_M_translator._M_match_range(__it.first, __it.second, __s))
613	return true;
614	if (_M_traits.isctype(__ch, _M_class_set))
615	return true;
616	if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
617	_M_traits.transform_primary(&__ch, &__ch+`1`))
618	!= _M_equiv_set.end())
619	return true;
620	for (auto& __it : _M_neg_class_set)
621	if (!_M_traits.isctype(__ch, __it))
622	return true;
623	return false;
624	}() ^ _M_is_non_matching;
625	}
626	} // namespace __detail
627
628	_GLIBCXX_END_NAMESPACE_VERSION
629	} // namespace
630

Provided by KDAB

Definitions

_Compiler
_M_disjunction
_M_alternative
_M_term
_M_assertion
_M_quantifier
_M_atom
_M_bracket_expression
_M_insert_any_matcher_ecma
_M_insert_any_matcher_posix
_M_insert_char_matcher
_M_insert_character_class_matcher
_M_insert_bracket_matcher
_M_expression_term
_M_try_char
_M_match_token
_M_cur_int_value

Start learning QML with our Intro Training

Find out more

Definitions

source code of include/c++/11/bits/regex_compiler.tcc