1 | // Boost token_functions.hpp ------------------------------------------------// |
2 | |
3 | // Copyright John R. Bandela 2001. |
4 | |
5 | // Distributed under the Boost Software License, Version 1.0. (See |
6 | // accompanying file LICENSE_1_0.txt or copy at |
7 | // http://www.boost.org/LICENSE_1_0.txt) |
8 | |
9 | // See http://www.boost.org/libs/tokenizer/ for documentation. |
10 | |
11 | // Revision History: |
12 | // 01 Oct 2004 Joaquin M Lopez Munoz |
13 | // Workaround for a problem with string::assign in msvc-stlport |
14 | // 06 Apr 2004 John Bandela |
15 | // Fixed a bug involving using char_delimiter with a true input iterator |
16 | // 28 Nov 2003 Robert Zeh and John Bandela |
17 | // Converted into "fast" functions that avoid using += when |
18 | // the supplied iterator isn't an input_iterator; based on |
19 | // some work done at Archelon and a version that was checked into |
20 | // the boost CVS for a short period of time. |
21 | // 20 Feb 2002 John Maddock |
22 | // Removed using namespace std declarations and added |
23 | // workaround for BOOST_NO_STDC_NAMESPACE (the library |
24 | // can be safely mixed with regex). |
25 | // 06 Feb 2002 Jeremy Siek |
26 | // Added char_separator. |
27 | // 02 Feb 2002 Jeremy Siek |
28 | // Removed tabs and a little cleanup. |
29 | |
30 | |
31 | #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ |
32 | #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ |
33 | |
34 | #include <vector> |
35 | #include <stdexcept> |
36 | #include <string> |
37 | #include <cctype> |
38 | #include <algorithm> // for find_if |
39 | #include <boost/config.hpp> |
40 | #include <boost/assert.hpp> |
41 | #include <boost/type_traits/is_pointer.hpp> |
42 | #include <boost/detail/workaround.hpp> |
43 | #include <boost/mpl/if.hpp> |
44 | #include <boost/throw_exception.hpp> |
45 | #if !defined(BOOST_NO_CWCTYPE) |
46 | #include <cwctype> |
47 | #endif |
48 | |
49 | // |
50 | // the following must not be macros if we are to prefix them |
51 | // with std:: (they shouldn't be macros anyway...) |
52 | // |
53 | #ifdef ispunct |
54 | # undef ispunct |
55 | #endif |
56 | #ifdef iswpunct |
57 | # undef iswpunct |
58 | #endif |
59 | #ifdef isspace |
60 | # undef isspace |
61 | #endif |
62 | #ifdef iswspace |
63 | # undef iswspace |
64 | #endif |
65 | // |
66 | // fix namespace problems: |
67 | // |
68 | #ifdef BOOST_NO_STDC_NAMESPACE |
69 | namespace std{ |
70 | using ::ispunct; |
71 | using ::isspace; |
72 | #if !defined(BOOST_NO_CWCTYPE) |
73 | using ::iswpunct; |
74 | using ::iswspace; |
75 | #endif |
76 | } |
77 | #endif |
78 | |
79 | namespace boost{ |
80 | //=========================================================================== |
// The escaped_list_separator class, which is a model of TokenizerFunction.
// An escaped list is a superset of what is commonly known as a comma
// separated value (csv) list. It is separated into fields by a comma or
// other character. If the delimiting character is inside quotes, then it is
// counted as a regular character. To allow for embedded quotes in a field,
// there can be escape sequences using the \ much like C.
// The roles of the comma, the quotation mark, and the escape
// character (backslash \) can be assigned to other characters.
89 | |
// Exception type raised by escaped_list_separator when it meets a malformed
// escape sequence. The text handed to the constructor is forwarded
// unchanged to std::runtime_error and is what what() reports.
struct escaped_list_error : public std::runtime_error
{
  escaped_list_error(const std::string& message)
    : std::runtime_error(message)
  {
  }
};
93 | |
94 | |
95 | // The out of the box GCC 2.95 on cygwin does not have a char_traits class. |
96 | // MSVC does not like the following typename |
97 | template <class Char, |
98 | class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > |
99 | class escaped_list_separator { |
100 | |
101 | private: |
102 | typedef std::basic_string<Char,Traits> string_type; |
103 | struct char_eq { |
104 | Char e_; |
105 | char_eq(Char e):e_(e) { } |
106 | bool operator()(Char c) { |
107 | return Traits::eq(e_,c); |
108 | } |
109 | }; |
110 | string_type escape_; |
111 | string_type c_; |
112 | string_type quote_; |
113 | bool last_; |
114 | |
115 | bool is_escape(Char e) { |
116 | char_eq f(e); |
117 | return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end(); |
118 | } |
119 | bool is_c(Char e) { |
120 | char_eq f(e); |
121 | return std::find_if(c_.begin(),c_.end(),f)!=c_.end(); |
122 | } |
123 | bool is_quote(Char e) { |
124 | char_eq f(e); |
125 | return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end(); |
126 | } |
127 | template <typename iterator, typename Token> |
128 | void do_escape(iterator& next,iterator end,Token& tok) { |
129 | if (++next == end) |
130 | BOOST_THROW_EXCEPTION(escaped_list_error(std::string("cannot end with escape" ))); |
131 | if (Traits::eq(*next,'n')) { |
132 | tok+='\n'; |
133 | return; |
134 | } |
135 | else if (is_quote(e: *next)) { |
136 | tok+=*next; |
137 | return; |
138 | } |
139 | else if (is_c(e: *next)) { |
140 | tok+=*next; |
141 | return; |
142 | } |
143 | else if (is_escape(e: *next)) { |
144 | tok+=*next; |
145 | return; |
146 | } |
147 | else |
148 | BOOST_THROW_EXCEPTION(escaped_list_error(std::string("unknown escape sequence" ))); |
149 | } |
150 | |
151 | public: |
152 | |
153 | explicit escaped_list_separator(Char e = '\\', |
154 | Char c = ',',Char q = '\"') |
155 | : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { } |
156 | |
157 | escaped_list_separator(string_type e, string_type c, string_type q) |
158 | : escape_(e), c_(c), quote_(q), last_(false) { } |
159 | |
160 | void reset() {last_=false;} |
161 | |
162 | template <typename InputIterator, typename Token> |
163 | bool operator()(InputIterator& next,InputIterator end,Token& tok) { |
164 | bool bInQuote = false; |
165 | tok = Token(); |
166 | |
167 | if (next == end) { |
168 | if (last_) { |
169 | last_ = false; |
170 | return true; |
171 | } |
172 | else |
173 | return false; |
174 | } |
175 | last_ = false; |
176 | for (;next != end;++next) { |
177 | if (is_escape(e: *next)) { |
178 | do_escape(next,end,tok); |
179 | } |
180 | else if (is_c(e: *next)) { |
181 | if (!bInQuote) { |
182 | // If we are not in quote, then we are done |
183 | ++next; |
184 | // The last character was a c, that means there is |
185 | // 1 more blank field |
186 | last_ = true; |
187 | return true; |
188 | } |
189 | else tok+=*next; |
190 | } |
191 | else if (is_quote(e: *next)) { |
192 | bInQuote=!bInQuote; |
193 | } |
194 | else { |
195 | tok += *next; |
196 | } |
197 | } |
198 | return true; |
199 | } |
200 | }; |
201 | |
202 | //=========================================================================== |
203 | // The classes here are used by offset_separator and char_separator to implement |
204 | // faster assigning of tokens using assign instead of += |
205 | |
206 | namespace tokenizer_detail { |
207 | //=========================================================================== |
208 | // Tokenizer was broken for wide character separators, at least on Windows, since |
209 | // CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts |
210 | // if higher values are passed in. The traits extension class should take care of this. |
211 | // Assuming that the conditional will always get optimized out in the function |
212 | // implementations, argument types are not a problem since both forms of character classifiers |
213 | // expect an int. |
214 | |
#if !defined(BOOST_NO_CWCTYPE)
// Character classifiers selected by the byte size N of the character type.
//
// Primary template (N != 1): the character is classified with the
// wide-character functions std::iswspace / std::iswpunct.
template<typename traits, int N>
struct traits_extension_details : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::iswspace(c) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::iswpunct(c) != 0;
  }
};

// Specialisation for one-byte character types: the character is classified
// with std::isspace / std::ispunct. Those functions require an argument
// that is representable as unsigned char (or EOF), so a char with a
// negative value is converted to unsigned char first instead of being
// passed through as a negative int.
template<typename traits>
struct traits_extension_details<traits, 1> : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
  }
};
#endif
242 | |
243 | |
244 | // In case there is no cwctype header, we implement the checks manually. |
245 | // We make use of the fact that the tested categories should fit in ASCII. |
246 | template<typename traits> |
247 | struct traits_extension : public traits { |
248 | typedef typename traits::char_type char_type; |
249 | static bool isspace(char_type c) |
250 | { |
251 | #if !defined(BOOST_NO_CWCTYPE) |
252 | return traits_extension_details<traits, sizeof(char_type)>::isspace(c); |
253 | #else |
254 | return static_cast< unsigned >(c) <= 255 && std::isspace(c) != 0; |
255 | #endif |
256 | } |
257 | |
258 | static bool ispunct(char_type c) |
259 | { |
260 | #if !defined(BOOST_NO_CWCTYPE) |
261 | return traits_extension_details<traits, sizeof(char_type)>::ispunct(c); |
262 | #else |
263 | return static_cast< unsigned >(c) <= 255 && std::ispunct(c) != 0; |
264 | #endif |
265 | } |
266 | }; |
267 | |
268 | // The assign_or_plus_equal struct contains functions that implement |
269 | // assign, +=, and clearing based on the iterator type. The |
270 | // generic case does nothing for plus_equal and clearing, while |
271 | // passing through the call for assign. |
272 | // |
273 | // When an input iterator is being used, the situation is reversed. |
274 | // The assign method does nothing, plus_equal invokes operator +=, |
275 | // and the clearing method sets the supplied token to the default |
276 | // token constructor's result. |
277 | // |
278 | |
// Token-building strategy for every iterator category other than
// std::input_iterator_tag: the token is produced in one step by assign(),
// so clear() and plus_equal() deliberately do nothing.
template<class IteratorTag>
struct assign_or_plus_equal {
  // Nothing to clear: assign() replaces the token's contents.
  template<class Token>
  static void clear(Token &) { }

  // Nothing to append: assign() copies the whole range later.
  template<class Token, class Value>
  static void plus_equal(Token &, const Value &) { }

  // Replaces the token's contents with the range [first, last).
  template<class Iterator, class Token>
  static void assign(Iterator first, Iterator last, Token &token) {
    token.assign(first, last);
  }
};

// Token-building strategy for std::input_iterator_tag: the token is reset
// by clear() and grown one value at a time by plus_equal(); assign() does
// nothing.
template <>
struct assign_or_plus_equal<std::input_iterator_tag> {
  // Resets the token to a default-constructed value.
  template<class Token>
  static void clear(Token &token) {
    token = Token();
  }

  // Appends one value to the token with operator+=.
  template<class Token, class Value>
  static void plus_equal(Token &token, const Value &value) {
    token += value;
  }

  // Intentionally empty: the token was already built by plus_equal().
  template<class Iterator, class Token>
  static void assign(Iterator , Iterator , Token &) { }
};
309 | |
310 | |
// Category selector that always reports std::random_access_iterator_tag,
// whatever type it is instantiated with.
template<class Pointer>
struct pointer_iterator_category
{
  typedef std::random_access_iterator_tag type;
};
315 | |
316 | |
// Category selector that reports the iterator's own nested
// iterator_category typedef.
template<class ClassIterator>
struct class_iterator_category
{
  typedef typename ClassIterator::iterator_category type;
};
321 | |
322 | |
323 | |
324 | // This portably gets the iterator_tag without partial template specialization |
325 | template<class Iterator> |
326 | struct get_iterator_category{ |
327 | typedef typename mpl::if_<is_pointer<Iterator>, |
328 | pointer_iterator_category<Iterator>, |
329 | class_iterator_category<Iterator> |
330 | >::type cat; |
331 | |
332 | typedef typename cat::type iterator_category; |
333 | }; |
334 | |
335 | |
336 | } // namespace tokenizer_detail |
337 | |
338 | |
339 | //=========================================================================== |
340 | // The offset_separator class, which is a model of TokenizerFunction. |
341 | // Offset breaks a string into tokens based on a range of offsets |
342 | |
343 | class offset_separator { |
344 | private: |
345 | |
346 | std::vector<int> offsets_; |
347 | unsigned int current_offset_; |
348 | bool wrap_offsets_; |
349 | bool return_partial_last_; |
350 | |
351 | public: |
352 | template <typename Iter> |
353 | offset_separator(Iter begin, Iter end, bool wrap_offsets = true, |
354 | bool return_partial_last = true) |
355 | : offsets_(begin,end), current_offset_(0), |
356 | wrap_offsets_(wrap_offsets), |
357 | return_partial_last_(return_partial_last) { } |
358 | |
359 | offset_separator() |
360 | : offsets_(1,1), current_offset_(), |
361 | wrap_offsets_(true), return_partial_last_(true) { } |
362 | |
363 | void reset() { |
364 | current_offset_ = 0; |
365 | } |
366 | |
367 | template <typename InputIterator, typename Token> |
368 | bool operator()(InputIterator& next, InputIterator end, Token& tok) |
369 | { |
370 | typedef tokenizer_detail::assign_or_plus_equal< |
371 | BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< |
372 | InputIterator |
373 | >::iterator_category |
374 | > assigner; |
375 | |
376 | BOOST_ASSERT(!offsets_.empty()); |
377 | |
378 | assigner::clear(tok); |
379 | InputIterator start(next); |
380 | |
381 | if (next == end) |
382 | return false; |
383 | |
384 | if (current_offset_ == offsets_.size()) |
385 | { |
386 | if (wrap_offsets_) |
387 | current_offset_=0; |
388 | else |
389 | return false; |
390 | } |
391 | |
392 | int c = offsets_[current_offset_]; |
393 | int i = 0; |
394 | for (; i < c; ++i) { |
395 | if (next == end)break; |
396 | assigner::plus_equal(tok,*next++); |
397 | } |
398 | assigner::assign(start,next,tok); |
399 | |
400 | if (!return_partial_last_) |
401 | if (i < (c-1) ) |
402 | return false; |
403 | |
404 | ++current_offset_; |
405 | return true; |
406 | } |
407 | }; |
408 | |
409 | |
410 | //=========================================================================== |
411 | // The char_separator class breaks a sequence of characters into |
412 | // tokens based on the character delimiters (very much like bad old |
413 | // strtok). A delimiter character can either be kept or dropped. A |
414 | // kept delimiter shows up as an output token, whereas a dropped |
415 | // delimiter does not. |
416 | |
417 | // This class replaces the char_delimiters_separator class. The |
418 | // constructor for the char_delimiters_separator class was too |
419 | // confusing and needed to be deprecated. However, because of the |
420 | // default arguments to the constructor, adding the new constructor |
421 | // would cause ambiguity, so instead I deprecated the whole class. |
422 | // The implementation of the class was also simplified considerably. |
423 | |
// Selects how char_separator treats empty tokens.
enum empty_token_policy
{
  drop_empty_tokens,
  keep_empty_tokens
};
425 | |
426 | // The out of the box GCC 2.95 on cygwin does not have a char_traits class. |
427 | template <typename Char, |
428 | typename Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > |
429 | class char_separator |
430 | { |
431 | typedef tokenizer_detail::traits_extension<Tr> Traits; |
432 | typedef std::basic_string<Char,Tr> string_type; |
433 | public: |
434 | explicit |
435 | char_separator(const Char* dropped_delims, |
436 | const Char* kept_delims = 0, |
437 | empty_token_policy empty_tokens = drop_empty_tokens) |
438 | : m_dropped_delims(dropped_delims), |
439 | m_use_ispunct(false), |
440 | m_use_isspace(false), |
441 | m_empty_tokens(empty_tokens), |
442 | m_output_done(false) |
443 | { |
444 | // Borland workaround |
445 | if (kept_delims) |
446 | m_kept_delims = kept_delims; |
447 | } |
448 | |
449 | // use ispunct() for kept delimiters and isspace for dropped. |
450 | explicit |
451 | char_separator() |
452 | : m_use_ispunct(true), |
453 | m_use_isspace(true), |
454 | m_empty_tokens(drop_empty_tokens), |
455 | m_output_done(false) { } |
456 | |
457 | void reset() { } |
458 | |
459 | template <typename InputIterator, typename Token> |
460 | bool operator()(InputIterator& next, InputIterator end, Token& tok) |
461 | { |
462 | typedef tokenizer_detail::assign_or_plus_equal< |
463 | BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< |
464 | InputIterator |
465 | >::iterator_category |
466 | > assigner; |
467 | |
468 | assigner::clear(tok); |
469 | |
470 | // skip past all dropped_delims |
471 | if (m_empty_tokens == drop_empty_tokens) |
472 | for (; next != end && is_dropped(E: *next); ++next) |
473 | { } |
474 | |
475 | InputIterator start(next); |
476 | |
477 | if (m_empty_tokens == drop_empty_tokens) { |
478 | |
479 | if (next == end) |
480 | return false; |
481 | |
482 | |
483 | // if we are on a kept_delims move past it and stop |
484 | if (is_kept(E: *next)) { |
485 | assigner::plus_equal(tok,*next); |
486 | ++next; |
487 | } else |
488 | // append all the non delim characters |
489 | for (; next != end && !is_dropped(E: *next) && !is_kept(E: *next); ++next) |
490 | assigner::plus_equal(tok,*next); |
491 | } |
492 | else { // m_empty_tokens == keep_empty_tokens |
493 | |
494 | // Handle empty token at the end |
495 | if (next == end) |
496 | { |
497 | if (m_output_done == false) |
498 | { |
499 | m_output_done = true; |
500 | assigner::assign(start,next,tok); |
501 | return true; |
502 | } |
503 | else |
504 | return false; |
505 | } |
506 | |
507 | if (is_kept(E: *next)) { |
508 | if (m_output_done == false) |
509 | m_output_done = true; |
510 | else { |
511 | assigner::plus_equal(tok,*next); |
512 | ++next; |
513 | m_output_done = false; |
514 | } |
515 | } |
516 | else if (m_output_done == false && is_dropped(E: *next)) { |
517 | m_output_done = true; |
518 | } |
519 | else { |
520 | if (is_dropped(E: *next)) |
521 | start=++next; |
522 | for (; next != end && !is_dropped(E: *next) && !is_kept(E: *next); ++next) |
523 | assigner::plus_equal(tok,*next); |
524 | m_output_done = true; |
525 | } |
526 | } |
527 | assigner::assign(start,next,tok); |
528 | return true; |
529 | } |
530 | |
531 | private: |
532 | string_type m_kept_delims; |
533 | string_type m_dropped_delims; |
534 | bool m_use_ispunct; |
535 | bool m_use_isspace; |
536 | empty_token_policy m_empty_tokens; |
537 | bool m_output_done; |
538 | |
539 | bool is_kept(Char E) const |
540 | { |
541 | if (m_kept_delims.length()) |
542 | return m_kept_delims.find(E) != string_type::npos; |
543 | else if (m_use_ispunct) { |
544 | return Traits::ispunct(E) != 0; |
545 | } else |
546 | return false; |
547 | } |
548 | bool is_dropped(Char E) const |
549 | { |
550 | if (m_dropped_delims.length()) |
551 | return m_dropped_delims.find(E) != string_type::npos; |
552 | else if (m_use_isspace) { |
553 | return Traits::isspace(E) != 0; |
554 | } else |
555 | return false; |
556 | } |
557 | }; |
558 | |
559 | //=========================================================================== |
// The following class is DEPRECATED, use class char_separator instead.
561 | // |
562 | // The char_delimiters_separator class, which is a model of |
563 | // TokenizerFunction. char_delimiters_separator breaks a string |
564 | // into tokens based on character delimiters. There are 2 types of |
565 | // delimiters. returnable delimiters can be returned as |
566 | // tokens. These are often punctuation. nonreturnable delimiters |
567 | // cannot be returned as tokens. These are often whitespace |
568 | |
569 | // The out of the box GCC 2.95 on cygwin does not have a char_traits class. |
570 | template <class Char, |
571 | class Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > |
572 | class char_delimiters_separator { |
573 | private: |
574 | |
575 | typedef tokenizer_detail::traits_extension<Tr> Traits; |
576 | typedef std::basic_string<Char,Tr> string_type; |
577 | string_type returnable_; |
578 | string_type nonreturnable_; |
579 | bool return_delims_; |
580 | bool no_ispunct_; |
581 | bool no_isspace_; |
582 | |
583 | bool is_ret(Char E)const |
584 | { |
585 | if (returnable_.length()) |
586 | return returnable_.find(E) != string_type::npos; |
587 | else{ |
588 | if (no_ispunct_) {return false;} |
589 | else{ |
590 | int r = Traits::ispunct(E); |
591 | return r != 0; |
592 | } |
593 | } |
594 | } |
595 | bool is_nonret(Char E)const |
596 | { |
597 | if (nonreturnable_.length()) |
598 | return nonreturnable_.find(E) != string_type::npos; |
599 | else{ |
600 | if (no_isspace_) {return false;} |
601 | else{ |
602 | int r = Traits::isspace(E); |
603 | return r != 0; |
604 | } |
605 | } |
606 | } |
607 | |
608 | public: |
609 | explicit char_delimiters_separator(bool return_delims = false, |
610 | const Char* returnable = 0, |
611 | const Char* nonreturnable = 0) |
612 | : returnable_(returnable ? returnable : string_type().c_str()), |
613 | nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()), |
614 | return_delims_(return_delims), no_ispunct_(returnable!=0), |
615 | no_isspace_(nonreturnable!=0) { } |
616 | |
617 | void reset() { } |
618 | |
619 | public: |
620 | |
621 | template <typename InputIterator, typename Token> |
622 | bool operator()(InputIterator& next, InputIterator end,Token& tok) { |
623 | tok = Token(); |
624 | |
625 | // skip past all nonreturnable delims |
626 | // skip past the returnable only if we are not returning delims |
627 | for (;next!=end && ( is_nonret(E: *next) || (is_ret(E: *next) |
628 | && !return_delims_ ) );++next) { } |
629 | |
630 | if (next == end) { |
631 | return false; |
632 | } |
633 | |
634 | // if we are to return delims and we are one a returnable one |
635 | // move past it and stop |
636 | if (is_ret(E: *next) && return_delims_) { |
637 | tok+=*next; |
638 | ++next; |
639 | } |
640 | else |
641 | // append all the non delim characters |
642 | for (;next!=end && !is_nonret(E: *next) && !is_ret(E: *next);++next) |
643 | tok+=*next; |
644 | |
645 | |
646 | return true; |
647 | } |
648 | }; |
649 | |
650 | |
651 | } //namespace boost |
652 | |
653 | #endif |
654 | |