1 | /////////////////////////////////////////////////////////////////////////////// |
2 | /// \file regex_compiler.hpp |
3 | /// Contains the definition of regex_compiler, a factory for building regex objects |
4 | /// from strings. |
5 | // |
6 | // Copyright 2008 Eric Niebler. Distributed under the Boost |
7 | // Software License, Version 1.0. (See accompanying file |
8 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
9 | |
10 | #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 |
11 | #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 |
12 | |
13 | // MS compatible compilers support #pragma once |
14 | #if defined(_MSC_VER) |
15 | # pragma once |
16 | #endif |
17 | |
18 | #include <map> |
19 | #include <boost/config.hpp> |
20 | #include <boost/assert.hpp> |
21 | #include <boost/next_prior.hpp> |
22 | #include <boost/range/begin.hpp> |
23 | #include <boost/range/end.hpp> |
24 | #include <boost/mpl/assert.hpp> |
25 | #include <boost/throw_exception.hpp> |
26 | #include <boost/type_traits/is_same.hpp> |
27 | #include <boost/type_traits/is_pointer.hpp> |
28 | #include <boost/utility/enable_if.hpp> |
29 | #include <boost/iterator/iterator_traits.hpp> |
30 | #include <boost/xpressive/basic_regex.hpp> |
31 | #include <boost/xpressive/detail/dynamic/parser.hpp> |
32 | #include <boost/xpressive/detail/dynamic/parse_charset.hpp> |
33 | #include <boost/xpressive/detail/dynamic/parser_enum.hpp> |
34 | #include <boost/xpressive/detail/dynamic/parser_traits.hpp> |
35 | #include <boost/xpressive/detail/core/linker.hpp> |
36 | #include <boost/xpressive/detail/core/optimize.hpp> |
37 | |
38 | namespace boost { namespace xpressive |
39 | { |
40 | |
41 | /////////////////////////////////////////////////////////////////////////////// |
42 | // regex_compiler |
43 | // |
44 | /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string. |
45 | /// |
46 | /// Class template regex_compiler is used to construct a basic_regex object from a string. The string |
47 | /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale, |
48 | /// after which all basic_regex objects created with that regex_compiler object will use that locale. |
49 | /// After creating a regex_compiler object, and optionally imbuing it with a locale, you can call the |
50 | /// compile() method to construct a basic_regex object, passing it the string representing the regular |
51 | /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex |
52 | /// objects compiled from the same string will have different regex_id's. |
53 | template<typename BidiIter, typename RegexTraits, typename CompilerTraits> |
54 | struct regex_compiler |
55 | { |
56 | typedef BidiIter iterator_type; |
57 | typedef typename iterator_value<BidiIter>::type char_type; |
58 | typedef regex_constants::syntax_option_type flag_type; |
59 | typedef RegexTraits traits_type; |
60 | typedef typename traits_type::string_type string_type; |
61 | typedef typename traits_type::locale_type locale_type; |
62 | typedef typename traits_type::char_class_type char_class_type; |
63 | |
64 | explicit regex_compiler(RegexTraits const &traits = RegexTraits()) |
65 | : mark_count_(0) |
66 | , hidden_mark_count_(0) |
67 | , traits_(traits) |
68 | , upper_(0) |
69 | , self_() |
70 | , rules_() |
71 | { |
72 | this->upper_ = lookup_classname(this->rxtraits(), "upper" ); |
73 | } |
74 | |
75 | /////////////////////////////////////////////////////////////////////////// |
76 | // imbue |
77 | /// Specify the locale to be used by a regex_compiler. |
78 | /// |
79 | /// \param loc The locale that this regex_compiler should use. |
80 | /// \return The previous locale. |
81 | locale_type imbue(locale_type loc) |
82 | { |
83 | locale_type oldloc = this->traits_.imbue(loc); |
84 | this->upper_ = lookup_classname(this->rxtraits(), "upper" ); |
85 | return oldloc; |
86 | } |
87 | |
88 | /////////////////////////////////////////////////////////////////////////// |
89 | // getloc |
90 | /// Get the locale used by a regex_compiler. |
91 | /// |
92 | /// \return The locale used by this regex_compiler. |
93 | locale_type getloc() const |
94 | { |
95 | return this->traits_.getloc(); |
96 | } |
97 | |
98 | /////////////////////////////////////////////////////////////////////////// |
99 | // compile |
100 | /// Builds a basic_regex object from a range of characters. |
101 | /// |
102 | /// \param begin The beginning of a range of characters representing the |
103 | /// regular expression to compile. |
104 | /// \param end The end of a range of characters representing the |
105 | /// regular expression to compile. |
106 | /// \param flags Optional bitmask that determines how the pat string is |
107 | /// interpreted. (See syntax_option_type.) |
108 | /// \return A basic_regex object corresponding to the regular expression |
109 | /// represented by the character range. |
110 | /// \pre InputIter is a model of the InputIterator concept. |
111 | /// \pre [begin,end) is a valid range. |
112 | /// \pre The range of characters specified by [begin,end) contains a |
113 | /// valid string-based representation of a regular expression. |
114 | /// \throw regex_error when the range of characters has invalid regular |
115 | /// expression syntax. |
116 | template<typename InputIter> |
117 | basic_regex<BidiIter> |
118 | compile(InputIter begin, InputIter end, flag_type flags = regex_constants::ECMAScript) |
119 | { |
120 | typedef typename iterator_category<InputIter>::type category; |
121 | return this->compile_(begin, end, flags, category()); |
122 | } |
123 | |
124 | /// \overload |
125 | /// |
126 | template<typename InputRange> |
127 | typename disable_if<is_pointer<InputRange>, basic_regex<BidiIter> >::type |
128 | compile(InputRange const &pat, flag_type flags = regex_constants::ECMAScript) |
129 | { |
130 | return this->compile(boost::begin(pat), boost::end(pat), flags); |
131 | } |
132 | |
133 | /// \overload |
134 | /// |
135 | basic_regex<BidiIter> |
136 | compile(char_type const *begin, flag_type flags = regex_constants::ECMAScript) |
137 | { |
138 | BOOST_ASSERT(0 != begin); |
139 | char_type const *end = begin + std::char_traits<char_type>::length(begin); |
140 | return this->compile(begin, end, flags); |
141 | } |
142 | |
143 | /// \overload |
144 | /// |
145 | basic_regex<BidiIter> compile(char_type const *begin, std::size_t size, flag_type flags) |
146 | { |
147 | BOOST_ASSERT(0 != begin); |
148 | char_type const *end = begin + size; |
149 | return this->compile(begin, end, flags); |
150 | } |
151 | |
152 | /////////////////////////////////////////////////////////////////////////// |
153 | // operator[] |
154 | /// Return a reference to the named regular expression. If no such named |
155 | /// regular expression exists, create a new regular expression and return |
156 | /// a reference to it. |
157 | /// |
158 | /// \param name A std::string containing the name of the regular expression. |
159 | /// \pre The string is not empty. |
160 | /// \throw bad_alloc on allocation failure. |
161 | basic_regex<BidiIter> &operator [](string_type const &name) |
162 | { |
163 | BOOST_ASSERT(!name.empty()); |
164 | return this->rules_[name]; |
165 | } |
166 | |
167 | /// \overload |
168 | /// |
169 | basic_regex<BidiIter> const &operator [](string_type const &name) const |
170 | { |
171 | BOOST_ASSERT(!name.empty()); |
172 | return this->rules_[name]; |
173 | } |
174 | |
175 | private: |
176 | |
177 | typedef detail::escape_value<char_type, char_class_type> escape_value; |
178 | typedef detail::alternate_matcher<detail::alternates_vector<BidiIter>, RegexTraits> alternate_matcher; |
179 | |
180 | /////////////////////////////////////////////////////////////////////////// |
181 | // compile_ |
182 | /// INTERNAL ONLY |
183 | template<typename FwdIter> |
184 | basic_regex<BidiIter> compile_(FwdIter begin, FwdIter end, flag_type flags, std::forward_iterator_tag) |
185 | { |
186 | BOOST_MPL_ASSERT((is_same<char_type, typename iterator_value<FwdIter>::type>)); |
187 | using namespace regex_constants; |
188 | this->reset(); |
189 | this->traits_.flags(flags); |
190 | |
191 | basic_regex<BidiIter> rextmp, *prex = &rextmp; |
192 | FwdIter tmp = begin; |
193 | |
194 | // Check if this regex is a named rule: |
195 | string_type name; |
196 | if(token_group_begin == this->traits_.get_token(tmp, end) && |
197 | BOOST_XPR_ENSURE_(tmp != end, error_paren, "mismatched parenthesis" ) && |
198 | token_rule_assign == this->traits_.get_group_type(tmp, end, name)) |
199 | { |
200 | begin = tmp; |
201 | BOOST_XPR_ENSURE_ |
202 | ( |
203 | begin != end && token_group_end == this->traits_.get_token(begin, end) |
204 | , error_paren |
205 | , "mismatched parenthesis" |
206 | ); |
207 | prex = &this->rules_[name]; |
208 | } |
209 | |
210 | this->self_ = detail::core_access<BidiIter>::get_regex_impl(*prex); |
211 | |
212 | // at the top level, a regex is a sequence of alternates |
213 | detail::sequence<BidiIter> seq = this->parse_alternates(begin, end); |
214 | BOOST_XPR_ENSURE_(begin == end, error_paren, "mismatched parenthesis" ); |
215 | |
216 | // terminate the sequence |
217 | seq += detail::make_dynamic<BidiIter>(detail::end_matcher()); |
218 | |
219 | // bundle the regex information into a regex_impl object |
220 | detail::common_compile(seq.xpr().matchable(), *this->self_, this->rxtraits()); |
221 | |
222 | this->self_->traits_ = new detail::traits_holder<RegexTraits>(this->rxtraits()); |
223 | this->self_->mark_count_ = this->mark_count_; |
224 | this->self_->hidden_mark_count_ = this->hidden_mark_count_; |
225 | |
226 | // References changed, update dependencies. |
227 | this->self_->tracking_update(); |
228 | this->self_.reset(); |
229 | return *prex; |
230 | } |
231 | |
232 | /////////////////////////////////////////////////////////////////////////// |
233 | // compile_ |
234 | /// INTERNAL ONLY |
235 | template<typename InputIter> |
236 | basic_regex<BidiIter> compile_(InputIter begin, InputIter end, flag_type flags, std::input_iterator_tag) |
237 | { |
238 | string_type pat(begin, end); |
239 | return this->compile_(boost::begin(pat), boost::end(pat), flags, std::forward_iterator_tag()); |
240 | } |
241 | |
242 | /////////////////////////////////////////////////////////////////////////// |
243 | // reset |
244 | /// INTERNAL ONLY |
245 | void reset() |
246 | { |
247 | this->mark_count_ = 0; |
248 | this->hidden_mark_count_ = 0; |
249 | this->traits_.flags(regex_constants::ECMAScript); |
250 | } |
251 | |
252 | /////////////////////////////////////////////////////////////////////////// |
253 | // regex_traits |
254 | /// INTERNAL ONLY |
255 | traits_type &rxtraits() |
256 | { |
257 | return this->traits_.traits(); |
258 | } |
259 | |
260 | /////////////////////////////////////////////////////////////////////////// |
261 | // regex_traits |
262 | /// INTERNAL ONLY |
263 | traits_type const &rxtraits() const |
264 | { |
265 | return this->traits_.traits(); |
266 | } |
267 | |
268 | /////////////////////////////////////////////////////////////////////////// |
269 | // parse_alternates |
270 | /// INTERNAL ONLY |
271 | template<typename FwdIter> |
272 | detail::sequence<BidiIter> parse_alternates(FwdIter &begin, FwdIter end) |
273 | { |
274 | using namespace regex_constants; |
275 | int count = 0; |
276 | FwdIter tmp = begin; |
277 | detail::sequence<BidiIter> seq; |
278 | |
279 | do switch(++count) |
280 | { |
281 | case 1: |
282 | seq = this->parse_sequence(tmp, end); |
283 | break; |
284 | case 2: |
285 | seq = detail::make_dynamic<BidiIter>(alternate_matcher()) | seq; |
286 | BOOST_FALLTHROUGH; |
287 | default: |
288 | seq |= this->parse_sequence(tmp, end); |
289 | } |
290 | while((begin = tmp) != end && token_alternate == this->traits_.get_token(tmp, end)); |
291 | |
292 | return seq; |
293 | } |
294 | |
295 | /////////////////////////////////////////////////////////////////////////// |
296 | // parse_group |
297 | /// INTERNAL ONLY |
298 | template<typename FwdIter> |
299 | detail::sequence<BidiIter> parse_group(FwdIter &begin, FwdIter end) |
300 | { |
301 | using namespace regex_constants; |
302 | int mark_nbr = 0; |
303 | bool keeper = false; |
304 | bool lookahead = false; |
305 | bool lookbehind = false; |
306 | bool negative = false; |
307 | string_type name; |
308 | |
309 | detail::sequence<BidiIter> seq, seq_end; |
310 | FwdIter tmp = FwdIter(); |
311 | |
312 | syntax_option_type old_flags = this->traits_.flags(); |
313 | |
314 | switch(this->traits_.get_group_type(begin, end, name)) |
315 | { |
316 | case token_no_mark: |
317 | // Don't process empty groups like (?:) or (?i) |
318 | // BUGBUG this doesn't handle the degenerate (?:)+ correctly |
319 | if(token_group_end == this->traits_.get_token(tmp = begin, end)) |
320 | { |
321 | return this->parse_atom(begin = tmp, end); |
322 | } |
323 | break; |
324 | |
325 | case token_negative_lookahead: |
326 | negative = true; |
327 | BOOST_FALLTHROUGH; |
328 | case token_positive_lookahead: |
329 | lookahead = true; |
330 | break; |
331 | |
332 | case token_negative_lookbehind: |
333 | negative = true; |
334 | BOOST_FALLTHROUGH; |
335 | case token_positive_lookbehind: |
336 | lookbehind = true; |
337 | break; |
338 | |
339 | case token_independent_sub_expression: |
340 | keeper = true; |
341 | break; |
342 | |
343 | case token_comment: |
344 | while(BOOST_XPR_ENSURE_(begin != end, error_paren, "mismatched parenthesis" )) |
345 | { |
346 | switch(this->traits_.get_token(begin, end)) |
347 | { |
348 | case token_group_end: |
349 | return this->parse_atom(begin, end); |
350 | case token_escape: |
351 | BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence" ); |
352 | BOOST_FALLTHROUGH; |
353 | case token_literal: |
354 | ++begin; |
355 | break; |
356 | default: |
357 | break; |
358 | } |
359 | } |
360 | break; |
361 | |
362 | case token_recurse: |
363 | BOOST_XPR_ENSURE_ |
364 | ( |
365 | begin != end && token_group_end == this->traits_.get_token(begin, end) |
366 | , error_paren |
367 | , "mismatched parenthesis" |
368 | ); |
369 | return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(this->self_)); |
370 | |
371 | case token_rule_assign: |
372 | BOOST_THROW_EXCEPTION( |
373 | regex_error(error_badrule, "rule assignments must be at the front of the regex" ) |
374 | ); |
375 | break; |
376 | |
377 | case token_rule_ref: |
378 | { |
379 | typedef detail::core_access<BidiIter> access; |
380 | BOOST_XPR_ENSURE_ |
381 | ( |
382 | begin != end && token_group_end == this->traits_.get_token(begin, end) |
383 | , error_paren |
384 | , "mismatched parenthesis" |
385 | ); |
386 | basic_regex<BidiIter> &rex = this->rules_[name]; |
387 | shared_ptr<detail::regex_impl<BidiIter> > impl = access::get_regex_impl(rex); |
388 | this->self_->track_reference(*impl); |
389 | return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(impl)); |
390 | } |
391 | |
392 | case token_named_mark: |
393 | mark_nbr = static_cast<int>(++this->mark_count_); |
394 | for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i) |
395 | { |
396 | BOOST_XPR_ENSURE_(this->self_->named_marks_[i].name_ != name, error_badmark, "named mark already exists" ); |
397 | } |
398 | this->self_->named_marks_.push_back(detail::named_mark<char_type>(name, this->mark_count_)); |
399 | seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr)); |
400 | seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr)); |
401 | break; |
402 | |
403 | case token_named_mark_ref: |
404 | BOOST_XPR_ENSURE_ |
405 | ( |
406 | begin != end && token_group_end == this->traits_.get_token(begin, end) |
407 | , error_paren |
408 | , "mismatched parenthesis" |
409 | ); |
410 | for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i) |
411 | { |
412 | if(this->self_->named_marks_[i].name_ == name) |
413 | { |
414 | mark_nbr = static_cast<int>(this->self_->named_marks_[i].mark_nbr_); |
415 | return detail::make_backref_xpression<BidiIter> |
416 | ( |
417 | mark_nbr, this->traits_.flags(), this->rxtraits() |
418 | ); |
419 | } |
420 | } |
421 | BOOST_THROW_EXCEPTION(regex_error(error_badmark, "invalid named back-reference" )); |
422 | break; |
423 | |
424 | default: |
425 | mark_nbr = static_cast<int>(++this->mark_count_); |
426 | seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr)); |
427 | seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr)); |
428 | break; |
429 | } |
430 | |
431 | // alternates |
432 | seq += this->parse_alternates(begin, end); |
433 | seq += seq_end; |
434 | BOOST_XPR_ENSURE_ |
435 | ( |
436 | begin != end && token_group_end == this->traits_.get_token(begin, end) |
437 | , error_paren |
438 | , "mismatched parenthesis" |
439 | ); |
440 | |
441 | typedef detail::shared_matchable<BidiIter> xpr_type; |
442 | if(lookahead) |
443 | { |
444 | seq += detail::make_independent_end_xpression<BidiIter>(seq.pure()); |
445 | detail::lookahead_matcher<xpr_type> lam(seq.xpr(), negative, seq.pure()); |
446 | seq = detail::make_dynamic<BidiIter>(lam); |
447 | } |
448 | else if(lookbehind) |
449 | { |
450 | seq += detail::make_independent_end_xpression<BidiIter>(seq.pure()); |
451 | detail::lookbehind_matcher<xpr_type> lbm(seq.xpr(), seq.width().value(), negative, seq.pure()); |
452 | seq = detail::make_dynamic<BidiIter>(lbm); |
453 | } |
454 | else if(keeper) // independent sub-expression |
455 | { |
456 | seq += detail::make_independent_end_xpression<BidiIter>(seq.pure()); |
457 | detail::keeper_matcher<xpr_type> km(seq.xpr(), seq.pure()); |
458 | seq = detail::make_dynamic<BidiIter>(km); |
459 | } |
460 | |
461 | // restore the modifiers |
462 | this->traits_.flags(old_flags); |
463 | return seq; |
464 | } |
465 | |
466 | /////////////////////////////////////////////////////////////////////////// |
467 | // parse_charset |
468 | /// INTERNAL ONLY |
469 | template<typename FwdIter> |
470 | detail::sequence<BidiIter> parse_charset(FwdIter &begin, FwdIter end) |
471 | { |
472 | detail::compound_charset<traits_type> chset; |
473 | |
474 | // call out to a helper to actually parse the character set |
475 | detail::parse_charset(begin, end, chset, this->traits_); |
476 | |
477 | return detail::make_charset_xpression<BidiIter> |
478 | ( |
479 | chset |
480 | , this->rxtraits() |
481 | , this->traits_.flags() |
482 | ); |
483 | } |
484 | |
485 | /////////////////////////////////////////////////////////////////////////// |
486 | // parse_atom |
487 | /// INTERNAL ONLY |
488 | template<typename FwdIter> |
489 | detail::sequence<BidiIter> parse_atom(FwdIter &begin, FwdIter end) |
490 | { |
491 | using namespace regex_constants; |
492 | escape_value esc = { 0, 0, 0, detail::escape_char }; |
493 | FwdIter old_begin = begin; |
494 | |
495 | switch(this->traits_.get_token(begin, end)) |
496 | { |
497 | case token_literal: |
498 | return detail::make_literal_xpression<BidiIter> |
499 | ( |
500 | this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits() |
501 | ); |
502 | |
503 | case token_any: |
504 | return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits()); |
505 | |
506 | case token_assert_begin_sequence: |
507 | return detail::make_dynamic<BidiIter>(detail::assert_bos_matcher()); |
508 | |
509 | case token_assert_end_sequence: |
510 | return detail::make_dynamic<BidiIter>(detail::assert_eos_matcher()); |
511 | |
512 | case token_assert_begin_line: |
513 | return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits()); |
514 | |
515 | case token_assert_end_line: |
516 | return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits()); |
517 | |
518 | case token_assert_word_boundary: |
519 | return detail::make_assert_word<BidiIter>(detail::word_boundary<mpl::true_>(), this->rxtraits()); |
520 | |
521 | case token_assert_not_word_boundary: |
522 | return detail::make_assert_word<BidiIter>(detail::word_boundary<mpl::false_>(), this->rxtraits()); |
523 | |
524 | case token_assert_word_begin: |
525 | return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits()); |
526 | |
527 | case token_assert_word_end: |
528 | return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits()); |
529 | |
530 | case token_escape: |
531 | esc = this->parse_escape(begin, end); |
532 | switch(esc.type_) |
533 | { |
534 | case detail::escape_mark: |
535 | return detail::make_backref_xpression<BidiIter> |
536 | ( |
537 | esc.mark_nbr_, this->traits_.flags(), this->rxtraits() |
538 | ); |
539 | case detail::escape_char: |
540 | return detail::make_char_xpression<BidiIter> |
541 | ( |
542 | esc.ch_, this->traits_.flags(), this->rxtraits() |
543 | ); |
544 | case detail::escape_class: |
545 | return detail::make_posix_charset_xpression<BidiIter> |
546 | ( |
547 | esc.class_ |
548 | , this->is_upper_(*begin++) |
549 | , this->traits_.flags() |
550 | , this->rxtraits() |
551 | ); |
552 | } |
553 | |
554 | case token_group_begin: |
555 | return this->parse_group(begin, end); |
556 | |
557 | case token_charset_begin: |
558 | return this->parse_charset(begin, end); |
559 | |
560 | case token_invalid_quantifier: |
561 | BOOST_THROW_EXCEPTION(regex_error(error_badrepeat, "quantifier not expected" )); |
562 | break; |
563 | |
564 | case token_quote_meta_begin: |
565 | return detail::make_literal_xpression<BidiIter> |
566 | ( |
567 | this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits() |
568 | ); |
569 | |
570 | case token_quote_meta_end: |
571 | BOOST_THROW_EXCEPTION( |
572 | regex_error( |
573 | error_escape |
574 | , "found quote-meta end without corresponding quote-meta begin" |
575 | ) |
576 | ); |
577 | break; |
578 | |
579 | case token_end_of_pattern: |
580 | break; |
581 | |
582 | default: |
583 | begin = old_begin; |
584 | break; |
585 | } |
586 | |
587 | return detail::sequence<BidiIter>(); |
588 | } |
589 | |
590 | /////////////////////////////////////////////////////////////////////////// |
591 | // parse_quant |
592 | /// INTERNAL ONLY |
593 | template<typename FwdIter> |
594 | detail::sequence<BidiIter> parse_quant(FwdIter &begin, FwdIter end) |
595 | { |
596 | BOOST_ASSERT(begin != end); |
597 | detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ }; |
598 | detail::sequence<BidiIter> seq = this->parse_atom(begin, end); |
599 | |
600 | // BUGBUG this doesn't handle the degenerate (?:)+ correctly |
601 | if(!seq.empty() && begin != end && detail::quant_none != seq.quant()) |
602 | { |
603 | if(this->traits_.get_quant_spec(begin, end, spec)) |
604 | { |
605 | BOOST_ASSERT(spec.min_ <= spec.max_); |
606 | |
607 | if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing. |
608 | { |
609 | seq = this->parse_quant(begin, end); |
610 | } |
611 | else |
612 | { |
613 | seq.repeat(spec); |
614 | } |
615 | } |
616 | } |
617 | |
618 | return seq; |
619 | } |
620 | |
621 | /////////////////////////////////////////////////////////////////////////// |
622 | // parse_sequence |
623 | /// INTERNAL ONLY |
624 | template<typename FwdIter> |
625 | detail::sequence<BidiIter> parse_sequence(FwdIter &begin, FwdIter end) |
626 | { |
627 | detail::sequence<BidiIter> seq; |
628 | |
629 | while(begin != end) |
630 | { |
631 | detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end); |
632 | |
633 | // did we find a quantified atom? |
634 | if(seq_quant.empty()) |
635 | break; |
636 | |
637 | // chain it to the end of the xpression sequence |
638 | seq += seq_quant; |
639 | } |
640 | |
641 | return seq; |
642 | } |
643 | |
644 | /////////////////////////////////////////////////////////////////////////// |
645 | // parse_literal |
646 | // scan ahead looking for char literals to be globbed together into a string literal |
647 | /// INTERNAL ONLY |
648 | template<typename FwdIter> |
649 | string_type parse_literal(FwdIter &begin, FwdIter end) |
650 | { |
651 | using namespace regex_constants; |
652 | BOOST_ASSERT(begin != end); |
653 | BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end)); |
654 | escape_value esc = { 0, 0, 0, detail::escape_char }; |
655 | string_type literal(1, *begin); |
656 | |
657 | for(FwdIter prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp) |
658 | { |
659 | detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ }; |
660 | if(this->traits_.get_quant_spec(tmp, end, spec)) |
661 | { |
662 | if(literal.size() != 1) |
663 | { |
664 | begin = prev; |
665 | literal.erase(boost::prior(literal.end())); |
666 | } |
667 | return literal; |
668 | } |
669 | else switch(this->traits_.get_token(tmp, end)) |
670 | { |
671 | case token_escape: |
672 | esc = this->parse_escape(tmp, end); |
673 | if(detail::escape_char != esc.type_) return literal; |
674 | literal.insert(literal.end(), esc.ch_); |
675 | break; |
676 | case token_literal: |
677 | literal.insert(literal.end(), *tmp++); |
678 | break; |
679 | default: |
680 | return literal; |
681 | } |
682 | } |
683 | |
684 | return literal; |
685 | } |
686 | |
687 | /////////////////////////////////////////////////////////////////////////// |
688 | // parse_quote_meta |
689 | // scan ahead looking for char literals to be globbed together into a string literal |
690 | /// INTERNAL ONLY |
691 | template<typename FwdIter> |
692 | string_type parse_quote_meta(FwdIter &begin, FwdIter end) |
693 | { |
694 | using namespace regex_constants; |
695 | FwdIter old_begin = begin, old_end; |
696 | while(end != (old_end = begin)) |
697 | { |
698 | switch(this->traits_.get_token(begin, end)) |
699 | { |
700 | case token_quote_meta_end: |
701 | return string_type(old_begin, old_end); |
702 | case token_escape: |
703 | BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence" ); |
704 | BOOST_FALLTHROUGH; |
705 | case token_invalid_quantifier: |
706 | case token_literal: |
707 | ++begin; |
708 | break; |
709 | default: |
710 | break; |
711 | } |
712 | } |
713 | return string_type(old_begin, begin); |
714 | } |
715 | |
716 | /////////////////////////////////////////////////////////////////////////////// |
717 | // parse_escape |
718 | /// INTERNAL ONLY |
719 | template<typename FwdIter> |
720 | escape_value parse_escape(FwdIter &begin, FwdIter end) |
721 | { |
722 | BOOST_XPR_ENSURE_(begin != end, regex_constants::error_escape, "incomplete escape sequence" ); |
723 | |
724 | // first, check to see if this can be a backreference |
725 | if(0 < this->rxtraits().value(*begin, 10)) |
726 | { |
727 | // Parse at most 3 decimal digits. |
728 | FwdIter tmp = begin; |
729 | int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999); |
730 | |
731 | // If the resulting number could conceivably be a backref, then it is. |
732 | if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_)) |
733 | { |
734 | begin = tmp; |
735 | escape_value esc = {0, mark_nbr, 0, detail::escape_mark}; |
736 | return esc; |
737 | } |
738 | } |
739 | |
740 | // Not a backreference, defer to the parse_escape helper |
741 | return detail::parse_escape(begin, end, this->traits_); |
742 | } |
743 | |
744 | bool is_upper_(char_type ch) const |
745 | { |
746 | return 0 != this->upper_ && this->rxtraits().isctype(ch, this->upper_); |
747 | } |
748 | |
749 | std::size_t mark_count_; |
750 | std::size_t hidden_mark_count_; |
751 | CompilerTraits traits_; |
752 | typename RegexTraits::char_class_type upper_; |
753 | shared_ptr<detail::regex_impl<BidiIter> > self_; |
754 | std::map<string_type, basic_regex<BidiIter> > rules_; |
755 | }; |
756 | |
757 | }} // namespace boost::xpressive |
758 | |
759 | #endif |
760 | |