1 | /////////////////////////////////////////////////////////////////////////////// |
2 | /// \file regex_token_iterator.hpp |
3 | /// Contains the definition of regex_token_iterator, an STL-compatible iterator |
4 | /// for tokenizing a string using a regular expression. |
5 | // |
6 | // Copyright 2008 Eric Niebler. Distributed under the Boost |
7 | // Software License, Version 1.0. (See accompanying file |
8 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
9 | |
10 | #ifndef BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005 |
11 | #define BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005 |
12 | |
13 | // MS compatible compilers support #pragma once |
14 | #if defined(_MSC_VER) |
15 | # pragma once |
16 | #endif |
17 | |
18 | #include <vector> |
19 | #include <boost/assert.hpp> |
20 | #include <boost/mpl/assert.hpp> |
21 | #include <boost/type_traits/is_same.hpp> |
22 | #include <boost/type_traits/is_convertible.hpp> |
23 | #include <boost/xpressive/regex_iterator.hpp> |
24 | |
25 | namespace boost { namespace xpressive { namespace detail |
26 | { |
27 | |
28 | ////////////////////////////////////////////////////////////////////////// |
29 | // regex_token_iterator_impl |
30 | // |
31 | template<typename BidiIter> |
32 | struct regex_token_iterator_impl |
33 | : counted_base<regex_token_iterator_impl<BidiIter> > |
34 | { |
35 | typedef sub_match<BidiIter> value_type; |
36 | |
37 | regex_token_iterator_impl |
38 | ( |
39 | BidiIter begin |
40 | , BidiIter cur |
41 | , BidiIter end |
42 | , BidiIter next_search |
43 | , basic_regex<BidiIter> const &rex |
44 | , regex_constants::match_flag_type flags = regex_constants::match_default |
45 | , std::vector<int> subs = std::vector<int>(1, 0) |
46 | , int n = -2 |
47 | , bool not_null = false |
48 | ) |
49 | : iter_(begin, cur, end, next_search, rex, flags, not_null) |
50 | , result_() |
51 | , n_((-2 == n) ? (int)subs.size() - 1 : n) |
52 | , subs_() |
53 | { |
54 | BOOST_ASSERT(0 != subs.size()); |
55 | this->subs_.swap(subs); |
56 | } |
57 | |
58 | bool next() |
59 | { |
60 | if(-1 != this->n_) |
61 | { |
62 | BidiIter cur = this->iter_.state_.cur_; |
63 | if(0 != (++this->n_ %= (int)this->subs_.size()) || this->iter_.next()) |
64 | { |
65 | this->result_ = (-1 == this->subs_[ this->n_ ]) |
66 | ? this->iter_.what_.prefix() |
67 | : this->iter_.what_[ this->subs_[ this->n_ ] ]; |
68 | return true; |
69 | } |
70 | else if(-1 == this->subs_[ this->n_-- ] && cur != this->iter_.state_.end_) |
71 | { |
72 | this->result_ = value_type(cur, this->iter_.state_.end_, true); |
73 | return true; |
74 | } |
75 | } |
76 | |
77 | return false; |
78 | } |
79 | |
80 | bool equal_to(regex_token_iterator_impl<BidiIter> const &that) const |
81 | { |
82 | return this->iter_.equal_to(that.iter_) && this->n_ == that.n_; |
83 | } |
84 | |
85 | regex_iterator_impl<BidiIter> iter_; |
86 | value_type result_; |
87 | int n_; |
88 | std::vector<int> subs_; |
89 | }; |
90 | |
/// Identity overload: a plain \c int already is a mark number.
/// \param i The sub-match index.
/// \return \c i unchanged.
inline int get_mark_number(int i)
{
    return i;
}
95 | |
/// Wraps a single sub-match index in a one-element vector.
/// \param subs The sub-match index.
/// \return A vector whose only element is \c subs.
inline std::vector<int> to_vector(int subs)
{
    std::vector<int> single(1, subs);
    return single;
}
100 | |
/// Pass-through overload for a vector that already holds \c int indices.
/// \param subs The sub-match indices.
/// \return A reference to \c subs itself; no copy is made here.
inline std::vector<int> const &to_vector
(
    std::vector<int> const &subs
)
{
    return subs;
}
105 | |
106 | template<typename Int, std::size_t Size> |
107 | inline std::vector<int> to_vector(Int const (&sub_matches)[ Size ]) |
108 | { |
109 | // so that people can specify sub-match indices inline with |
110 | // string literals, like "\1\2\3", leave off the trailing '\0' |
111 | std::size_t const size = Size - is_same<Int, char>::value; |
112 | std::vector<int> vect(size); |
113 | for(std::size_t i = 0; i < size; ++i) |
114 | { |
115 | vect[i] = get_mark_number(sub_matches[i]); |
116 | } |
117 | return vect; |
118 | } |
119 | |
120 | template<typename Int> |
121 | inline std::vector<int> to_vector(std::vector<Int> const &sub_matches) |
122 | { |
123 | BOOST_MPL_ASSERT((is_convertible<Int, int>)); |
124 | return std::vector<int>(sub_matches.begin(), sub_matches.end()); |
125 | } |
126 | |
127 | } // namespace detail |
128 | |
129 | ////////////////////////////////////////////////////////////////////////// |
130 | // regex_token_iterator |
131 | // |
132 | template<typename BidiIter> |
133 | struct regex_token_iterator |
134 | { |
135 | typedef basic_regex<BidiIter> regex_type; |
136 | typedef typename iterator_value<BidiIter>::type char_type; |
137 | typedef sub_match<BidiIter> value_type; |
138 | typedef std::ptrdiff_t difference_type; |
139 | typedef value_type const *pointer; |
140 | typedef value_type const &reference; |
141 | typedef std::forward_iterator_tag iterator_category; |
142 | |
143 | /// INTERNAL ONLY |
144 | typedef detail::regex_token_iterator_impl<BidiIter> impl_type_; |
145 | |
146 | /// \post \c *this is the end of sequence iterator. |
147 | regex_token_iterator() |
148 | : impl_() |
149 | { |
150 | } |
151 | |
152 | /// \param begin The beginning of the character range to search. |
153 | /// \param end The end of the character range to search. |
154 | /// \param rex The regex pattern to search for. |
155 | /// \pre \c [begin,end) is a valid range. |
156 | regex_token_iterator |
157 | ( |
158 | BidiIter begin |
159 | , BidiIter end |
160 | , basic_regex<BidiIter> const &rex |
161 | ) |
162 | : impl_() |
163 | { |
164 | if(0 != rex.regex_id()) |
165 | { |
166 | this->impl_ = new impl_type_(begin, begin, end, begin, rex); |
167 | this->next_(); |
168 | } |
169 | } |
170 | |
171 | /// \param begin The beginning of the character range to search. |
172 | /// \param end The end of the character range to search. |
173 | /// \param rex The regex pattern to search for. |
174 | /// \param args A let() expression with argument bindings for semantic actions. |
175 | /// \pre \c [begin,end) is a valid range. |
176 | template<typename LetExpr> |
177 | regex_token_iterator |
178 | ( |
179 | BidiIter begin |
180 | , BidiIter end |
181 | , basic_regex<BidiIter> const &rex |
182 | , detail::let_<LetExpr> const &args |
183 | ) |
184 | : impl_() |
185 | { |
186 | if(0 != rex.regex_id()) |
187 | { |
188 | this->impl_ = new impl_type_(begin, begin, end, begin, rex); |
189 | detail::bind_args(args, this->impl_->iter_.what_); |
190 | this->next_(); |
191 | } |
192 | } |
193 | |
194 | /// \param begin The beginning of the character range to search. |
195 | /// \param end The end of the character range to search. |
196 | /// \param rex The regex pattern to search for. |
197 | /// \param subs A range of integers designating sub-matches to be treated as tokens. |
198 | /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.) |
199 | /// \pre \c [begin,end) is a valid range. |
200 | /// \pre \c subs is either an integer greater or equal to -1, |
201 | /// or else an array or non-empty \c std::vector\<\> of such integers. |
202 | template<typename Subs> |
203 | regex_token_iterator |
204 | ( |
205 | BidiIter begin |
206 | , BidiIter end |
207 | , basic_regex<BidiIter> const &rex |
208 | , Subs const &subs |
209 | , regex_constants::match_flag_type flags = regex_constants::match_default |
210 | ) |
211 | : impl_() |
212 | { |
213 | if(0 != rex.regex_id()) |
214 | { |
215 | this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs)); |
216 | this->next_(); |
217 | } |
218 | } |
219 | |
220 | /// \param begin The beginning of the character range to search. |
221 | /// \param end The end of the character range to search. |
222 | /// \param rex The regex pattern to search for. |
223 | /// \param subs A range of integers designating sub-matches to be treated as tokens. |
224 | /// \param args A let() expression with argument bindings for semantic actions. |
225 | /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.) |
226 | /// \pre \c [begin,end) is a valid range. |
227 | /// \pre \c subs is either an integer greater or equal to -1, |
228 | /// or else an array or non-empty \c std::vector\<\> of such integers. |
229 | template<typename Subs, typename LetExpr> |
230 | regex_token_iterator |
231 | ( |
232 | BidiIter begin |
233 | , BidiIter end |
234 | , basic_regex<BidiIter> const &rex |
235 | , Subs const &subs |
236 | , detail::let_<LetExpr> const &args |
237 | , regex_constants::match_flag_type flags = regex_constants::match_default |
238 | ) |
239 | : impl_() |
240 | { |
241 | if(0 != rex.regex_id()) |
242 | { |
243 | this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs)); |
244 | detail::bind_args(args, this->impl_->iter_.what_); |
245 | this->next_(); |
246 | } |
247 | } |
248 | |
249 | /// \post <tt>*this == that</tt> |
250 | regex_token_iterator(regex_token_iterator<BidiIter> const &that) |
251 | : impl_(that.impl_) // COW |
252 | { |
253 | } |
254 | |
255 | /// \post <tt>*this == that</tt> |
256 | regex_token_iterator<BidiIter> &operator =(regex_token_iterator<BidiIter> const &that) |
257 | { |
258 | this->impl_ = that.impl_; // COW |
259 | return *this; |
260 | } |
261 | |
262 | friend bool operator ==(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right) |
263 | { |
264 | if(!left.impl_ || !right.impl_) |
265 | { |
266 | return !left.impl_ && !right.impl_; |
267 | } |
268 | |
269 | return left.impl_->equal_to(*right.impl_); |
270 | } |
271 | |
272 | friend bool operator !=(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right) |
273 | { |
274 | return !(left == right); |
275 | } |
276 | |
277 | value_type const &operator *() const |
278 | { |
279 | return this->impl_->result_; |
280 | } |
281 | |
282 | value_type const *operator ->() const |
283 | { |
284 | return &this->impl_->result_; |
285 | } |
286 | |
287 | /// If N == -1 then sets *this equal to the end of sequence iterator. |
288 | /// Otherwise if N+1 \< subs.size(), then increments N and sets result equal to |
289 | /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())). |
290 | /// Otherwise if what.prefix().first != what[0].second and if the element match_prev_avail is |
291 | /// not set in flags then sets it. Then locates the next match as if by calling |
292 | /// regex_search(what[0].second, end, what, *pre, flags), with the following variation: |
293 | /// in the event that the previous match found was of zero length (what[0].length() == 0) |
294 | /// then attempts to find a non-zero length match starting at what[0].second, only if that |
295 | /// fails and provided what[0].second != suffix().second does it look for a (possibly zero |
296 | /// length) match starting from what[0].second + 1. If such a match is found then sets N |
297 | /// equal to zero, and sets result equal to |
298 | /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())). |
299 | /// Otherwise if no further matches were found, then let last_end be the endpoint of the last |
300 | /// match that was found. Then if last_end != end and subs[0] == -1 sets N equal to -1 and |
301 | /// sets result equal to value_type(last_end, end). Otherwise sets *this equal to the end |
302 | /// of sequence iterator. |
303 | regex_token_iterator<BidiIter> &operator ++() |
304 | { |
305 | this->fork_(); // un-share the implementation |
306 | this->next_(); |
307 | return *this; |
308 | } |
309 | |
310 | regex_token_iterator<BidiIter> operator ++(int) |
311 | { |
312 | regex_token_iterator<BidiIter> tmp(*this); |
313 | ++*this; |
314 | return tmp; |
315 | } |
316 | |
317 | private: |
318 | |
319 | /// INTERNAL ONLY |
320 | void fork_() |
321 | { |
322 | if(1 != this->impl_->use_count()) |
323 | { |
324 | intrusive_ptr<impl_type_> clone = new impl_type_ |
325 | ( |
326 | this->impl_->iter_.state_.begin_ |
327 | , this->impl_->iter_.state_.cur_ |
328 | , this->impl_->iter_.state_.end_ |
329 | , this->impl_->iter_.state_.next_search_ |
330 | , this->impl_->iter_.rex_ |
331 | , this->impl_->iter_.flags_ |
332 | , this->impl_->subs_ |
333 | , this->impl_->n_ |
334 | , this->impl_->iter_.not_null_ |
335 | ); |
336 | |
337 | // only copy the match_results struct if we have to. Note: if the next call |
338 | // to impl_->next() will return false or call regex_search, we don't need to |
339 | // copy the match_results struct. |
340 | if(-1 != this->impl_->n_ && this->impl_->n_ + 1 != static_cast<int>(this->impl_->subs_.size())) |
341 | { |
342 | // BUGBUG This is expensive -- it causes the sequence_stack to be cleared. |
343 | // Find a better way |
344 | clone->iter_.what_ = this->impl_->iter_.what_; |
345 | } |
346 | else |
347 | { |
348 | // At the very least, copy the action args |
349 | detail::core_access<BidiIter>::get_action_args(clone->iter_.what_) |
350 | = detail::core_access<BidiIter>::get_action_args(this->impl_->iter_.what_); |
351 | } |
352 | |
353 | this->impl_.swap(clone); |
354 | } |
355 | } |
356 | |
357 | /// INTERNAL ONLY |
358 | void next_() |
359 | { |
360 | BOOST_ASSERT(this->impl_ && 1 == this->impl_->use_count()); |
361 | if(!this->impl_->next()) |
362 | { |
363 | this->impl_ = 0; |
364 | } |
365 | } |
366 | |
367 | intrusive_ptr<impl_type_> impl_; |
368 | }; |
369 | |
370 | }} // namespace boost::xpressive |
371 | |
372 | #endif |
373 | |