1 | // class template regex -*- C++ -*- |
2 | |
3 | // Copyright (C) 2013-2021 Free Software Foundation, Inc. |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free |
6 | // software; you can redistribute it and/or modify it under the |
7 | // terms of the GNU General Public License as published by the |
8 | // Free Software Foundation; either version 3, or (at your option) |
9 | // any later version. |
10 | |
11 | // This library is distributed in the hope that it will be useful, |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | // GNU General Public License for more details. |
15 | |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version |
18 | // 3.1, as published by the Free Software Foundation. |
19 | |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
23 | // <http://www.gnu.org/licenses/>. |
24 | |
25 | /** |
26 | * @file bits/regex_executor.h |
27 | * This is an internal header file, included by other library headers. |
28 | * Do not attempt to use it directly. @headername{regex} |
29 | */ |
30 | |
31 | // FIXME convert comments to doxygen format. |
32 | |
33 | namespace std _GLIBCXX_VISIBILITY(default) |
34 | { |
35 | _GLIBCXX_BEGIN_NAMESPACE_VERSION |
36 | |
37 | namespace __detail |
38 | { |
39 | /** |
40 | * @addtogroup regex-detail |
41 | * @{ |
42 | */ |
43 | |
44 | /** |
45 | * @brief Takes a regex and an input string and does the matching. |
46 | * |
47 | * The %_Executor class has two modes: DFS mode and BFS mode, controlled |
48 | * by the template parameter %__dfs_mode. |
49 | */ |
50 | template<typename _BiIter, typename _Alloc, typename _TraitsT, |
51 | bool __dfs_mode> |
52 | class _Executor |
53 | { |
54 | using __search_mode = integral_constant<bool, __dfs_mode>; |
55 | using __dfs = true_type; |
56 | using __bfs = false_type; |
57 | |
58 | enum class _Match_mode : unsigned char { _Exact, _Prefix }; |
59 | |
60 | public: |
61 | typedef typename iterator_traits<_BiIter>::value_type _CharT; |
62 | typedef basic_regex<_CharT, _TraitsT> _RegexT; |
63 | typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec; |
64 | typedef regex_constants::match_flag_type _FlagT; |
65 | typedef typename _TraitsT::char_class_type _ClassT; |
66 | typedef _NFA<_TraitsT> _NFAT; |
67 | |
68 | public: |
69 | _Executor(_BiIter __begin, |
70 | _BiIter __end, |
71 | _ResultsVec& __results, |
72 | const _RegexT& __re, |
73 | _FlagT __flags) |
74 | : _M_begin(__begin), |
75 | _M_end(__end), |
76 | _M_re(__re), |
77 | _M_nfa(*__re._M_automaton), |
78 | _M_results(__results), |
79 | _M_rep_count(_M_nfa.size()), |
80 | _M_states(_M_nfa._M_start(), _M_nfa.size()), |
81 | _M_flags(__flags) |
82 | { |
83 | using namespace regex_constants; |
84 | if (__flags & match_prev_avail) // ignore not_bol and not_bow |
85 | _M_flags &= ~(match_not_bol | match_not_bow); |
86 | } |
87 | |
88 | // Set matched when string exactly matches the pattern. |
89 | bool |
90 | _M_match() |
91 | { |
92 | _M_current = _M_begin; |
93 | return _M_main(match_mode: _Match_mode::_Exact); |
94 | } |
95 | |
96 | // Set matched when some prefix of the string matches the pattern. |
97 | bool |
98 | _M_search_from_first() |
99 | { |
100 | _M_current = _M_begin; |
101 | return _M_main(match_mode: _Match_mode::_Prefix); |
102 | } |
103 | |
104 | bool |
105 | _M_search(); |
106 | |
107 | private: |
108 | void |
109 | _M_rep_once_more(_Match_mode __match_mode, _StateIdT); |
110 | |
111 | void |
112 | _M_handle_repeat(_Match_mode, _StateIdT); |
113 | |
114 | void |
115 | _M_handle_subexpr_begin(_Match_mode, _StateIdT); |
116 | |
117 | void |
118 | _M_handle_subexpr_end(_Match_mode, _StateIdT); |
119 | |
120 | void |
121 | _M_handle_line_begin_assertion(_Match_mode, _StateIdT); |
122 | |
123 | void |
124 | _M_handle_line_end_assertion(_Match_mode, _StateIdT); |
125 | |
126 | void |
127 | _M_handle_word_boundary(_Match_mode, _StateIdT); |
128 | |
129 | void |
130 | _M_handle_subexpr_lookahead(_Match_mode, _StateIdT); |
131 | |
132 | void |
133 | _M_handle_match(_Match_mode, _StateIdT); |
134 | |
135 | void |
136 | _M_handle_backref(_Match_mode, _StateIdT); |
137 | |
138 | void |
139 | _M_handle_accept(_Match_mode, _StateIdT); |
140 | |
141 | void |
142 | _M_handle_alternative(_Match_mode, _StateIdT); |
143 | |
144 | void |
145 | _M_dfs(_Match_mode __match_mode, _StateIdT __start); |
146 | |
147 | bool |
148 | _M_main(_Match_mode __match_mode) |
149 | { return _M_main_dispatch(__match_mode, __search_mode{}); } |
150 | |
151 | bool |
152 | _M_main_dispatch(_Match_mode __match_mode, __dfs); |
153 | |
154 | bool |
155 | _M_main_dispatch(_Match_mode __match_mode, __bfs); |
156 | |
157 | bool |
158 | _M_is_word(_CharT __ch) const |
159 | { |
160 | static const _CharT __s[2] = { 'w' }; |
161 | return _M_re._M_automaton->_M_traits.isctype |
162 | (__ch, _M_re._M_automaton->_M_traits.lookup_classname(__s, __s+1)); |
163 | } |
164 | |
165 | bool |
166 | _M_at_begin() const |
167 | { |
168 | if (_M_current == _M_begin) |
169 | { |
170 | // match_not_bol means ^ does not match [_M_begin,_M_begin) |
171 | if (_M_flags & regex_constants::match_not_bol) |
172 | return false; |
173 | // match_prev_avail means _M_begin is not the start of the input. |
174 | if (_M_flags & regex_constants::match_prev_avail) |
175 | { |
176 | // For ECMAScript multiline matches, check if the previous |
177 | // character is a line terminator. |
178 | if (_M_match_multiline()) |
179 | return _M_is_line_terminator(c: *std::prev(_M_current)); |
180 | else |
181 | return false; |
182 | } |
183 | else // ^ matches at _M_begin |
184 | return true; |
185 | } |
186 | else if (_M_match_multiline()) |
187 | return _M_is_line_terminator(c: *std::prev(_M_current)); |
188 | else |
189 | return false; |
190 | } |
191 | |
192 | bool |
193 | _M_at_end() const |
194 | { |
195 | if (_M_current == _M_end) |
196 | return !(_M_flags & regex_constants::match_not_eol); |
197 | else if (_M_match_multiline()) |
198 | return _M_is_line_terminator(c: *_M_current); |
199 | else |
200 | return false; |
201 | } |
202 | |
203 | bool |
204 | _M_word_boundary() const; |
205 | |
206 | bool |
207 | _M_lookahead(_StateIdT __next); |
208 | |
209 | bool |
210 | _M_is_line_terminator(_CharT __c) const |
211 | { |
212 | const auto& __traits = _M_re._M_automaton->_M_traits; |
213 | const auto& __ct = use_facet<ctype<_CharT>>(__traits.getloc()); |
214 | const char __n{ __ct.narrow(__c, ' ') }; |
215 | if (__n == '\n') |
216 | return true; |
217 | if (_M_re._M_automaton->_M_options() & regex_constants::ECMAScript) |
218 | { |
219 | if (__n == '\r') |
220 | return true; |
221 | // FIXME: U+2028 (line separator) and U+2029 (paragraph separator) |
222 | } |
223 | return false; |
224 | } |
225 | |
226 | bool |
227 | _M_match_multiline() const noexcept |
228 | { |
229 | constexpr auto __m |
230 | = regex_constants::ECMAScript | regex_constants::__multiline; |
231 | return (_M_re._M_automaton->_M_options() & __m) == __m; |
232 | } |
233 | |
234 | // Holds additional information used in BFS-mode. |
235 | template<typename _SearchMode, typename _ResultsVec> |
236 | struct _State_info; |
237 | |
238 | template<typename _ResultsVec> |
239 | struct _State_info<__bfs, _ResultsVec> |
240 | { |
241 | explicit |
242 | _State_info(_StateIdT __start, size_t __n) |
243 | : _M_visited_states(new bool[__n]()), _M_start(__start) |
244 | { } |
245 | |
246 | bool _M_visited(_StateIdT __i) |
247 | { |
248 | if (_M_visited_states[__i]) |
249 | return true; |
250 | _M_visited_states[__i] = true; |
251 | return false; |
252 | } |
253 | |
254 | void _M_queue(_StateIdT __i, const _ResultsVec& __res) |
255 | { _M_match_queue.emplace_back(__i, __res); } |
256 | |
257 | // Dummy implementations for BFS mode. |
258 | _BiIter* _M_get_sol_pos() { return nullptr; } |
259 | |
260 | // Saves states that need to be considered for the next character. |
261 | vector<pair<_StateIdT, _ResultsVec>> _M_match_queue; |
262 | // Indicates which states are already visited. |
263 | unique_ptr<bool[]> _M_visited_states; |
264 | // To record current solution. |
265 | _StateIdT _M_start; |
266 | }; |
267 | |
268 | template<typename _ResultsVec> |
269 | struct _State_info<__dfs, _ResultsVec> |
270 | { |
271 | explicit |
272 | _State_info(_StateIdT __start, size_t) : _M_start(__start) |
273 | { } |
274 | |
275 | // Dummy implementations for DFS mode. |
276 | bool _M_visited(_StateIdT) const { return false; } |
277 | void _M_queue(_StateIdT, const _ResultsVec&) { } |
278 | |
279 | _BiIter* _M_get_sol_pos() { return &_M_sol_pos; } |
280 | |
281 | // To record current solution. |
282 | _StateIdT _M_start; |
283 | _BiIter _M_sol_pos; |
284 | }; |
285 | |
286 | public: |
287 | _ResultsVec _M_cur_results; |
288 | _BiIter _M_current; |
289 | _BiIter _M_begin; |
290 | const _BiIter _M_end; |
291 | const _RegexT& _M_re; |
292 | const _NFAT& _M_nfa; |
293 | _ResultsVec& _M_results; |
294 | vector<pair<_BiIter, int>> _M_rep_count; |
295 | _State_info<__search_mode, _ResultsVec> _M_states; |
296 | _FlagT _M_flags; |
297 | // Do we have a solution so far? |
298 | bool _M_has_sol; |
299 | }; |
300 | |
301 | ///@} regex-detail |
302 | } // namespace __detail |
303 | _GLIBCXX_END_NAMESPACE_VERSION |
304 | } // namespace std |
305 | |
306 | #include <bits/regex_executor.tcc> |
307 | |