1// Copyright (c) 2001-2011 Hartmut Kaiser
2//
3// Distributed under the Boost Software License, Version 1.0. (See accompanying
4// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
6#if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM)
7#define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM
8
9#if defined(_MSC_VER)
10#pragma once
11#endif
12
13#include <iosfwd>
14
15#include <boost/spirit/home/support/detail/lexer/generator.hpp>
16#include <boost/spirit/home/support/detail/lexer/rules.hpp>
17#include <boost/spirit/home/support/detail/lexer/consts.hpp>
18#include <boost/spirit/home/support/unused.hpp>
19
20#include <boost/spirit/home/lex/lexer/lexertl/token.hpp>
21#include <boost/spirit/home/lex/lexer/lexertl/functor.hpp>
22#include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp>
23#include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp>
24#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
25#include <boost/spirit/home/support/detail/lexer/debug.hpp>
26#endif
27
28#include <iterator> // for std::iterator_traits
29
30namespace boost { namespace spirit { namespace lex { namespace lexertl
31{
32 ///////////////////////////////////////////////////////////////////////////
33 namespace detail
34 {
35 ///////////////////////////////////////////////////////////////////////
36 // The must_escape function checks if the given character value needs
37 // to be preceded by a backslash character to disable its special
38 // meaning in the context of a regular expression
39 ///////////////////////////////////////////////////////////////////////
// Tells whether the character c has a special meaning inside a regular
// expression and therefore has to be prefixed with a backslash in order
// to be matched literally.
//
// Returns true for exactly these characters:
//     + / * ? | ( ) [ ] { } . ^ $ \ "
// and false for everything else (including the null character).
template <typename Char>
inline bool must_escape(Char c)
{
    // FIXME: more needed?
    static char const special_chars[] = "+/*?|()[]{}.^$\\\"";

    // walk the table up to (not including) its terminating null
    for (char const* p = special_chars; *p != '\0'; ++p)
    {
        if (c == static_cast<Char>(*p))
            return true;
    }
    return false;
}
61
62 ///////////////////////////////////////////////////////////////////////
63 // The escape function returns the string representation of the given
64 // character value, possibly escaped with a backslash character, to
65 // allow it being safely used in a regular expression definition.
66 ///////////////////////////////////////////////////////////////////////
67 template <typename Char>
68 inline std::basic_string<Char> escape(Char ch)
69 {
70 std::basic_string<Char> result(1, ch);
71 if (detail::must_escape(ch))
72 {
73 typedef typename std::basic_string<Char>::size_type size_type;
74 result.insert((size_type)0, 1, '\\');
75 }
76 return result;
77 }
78
79 ///////////////////////////////////////////////////////////////////////
80 //
81 ///////////////////////////////////////////////////////////////////////
82 inline boost::lexer::regex_flags map_flags(unsigned int flags)
83 {
84 unsigned int retval = boost::lexer::none;
85 if (flags & match_flags::match_not_dot_newline)
86 retval |= boost::lexer::dot_not_newline;
87 if (flags & match_flags::match_icase)
88 retval |= boost::lexer::icase;
89
90 return boost::lexer::regex_flags(retval);
91 }
92 }
93
94 ///////////////////////////////////////////////////////////////////////////
95 template <typename Lexer, typename F>
96 bool generate_static(Lexer const&
97 , std::basic_ostream<typename Lexer::char_type>&
98 , typename Lexer::char_type const*, F);
99
100 ///////////////////////////////////////////////////////////////////////////
101 //
102 // Every lexer type to be used as a lexer for Spirit has to conform to
103 // the following public interface:
104 //
105 // typedefs:
106 // iterator_type The type of the iterator exposed by this lexer.
107 // token_type The type of the tokens returned from the exposed
108 // iterators.
109 //
110 // functions:
111 // default constructor
112 // Since lexers are instantiated as base classes
113 // only it might be a good idea to make this
114 // constructor protected.
115 // begin, end Return a pair of iterators, when dereferenced
116 // returning the sequence of tokens recognized in
117 // the input stream given as the parameters to the
118 // begin() function.
119 // add_token Should add the definition of a token to be
120 // recognized by this lexer.
121 // clear Should delete all current token definitions
122 // associated with the given state of this lexer
123 // object.
124 //
125 // template parameters:
126 // Iterator The type of the iterator used to access the
127 // underlying character stream.
128 // Token The type of the tokens to be returned from the
129 // exposed token iterator.
130 // Functor The type of the InputPolicy to use to instantiate
131 // the multi_pass iterator type to be used as the
132 // token iterator (returned from begin()/end()).
133 //
134 ///////////////////////////////////////////////////////////////////////////
135
136 ///////////////////////////////////////////////////////////////////////////
137 //
// The lexer class is an implementation of a Spirit.Lex lexer on
// top of Ben Hanson's lexertl library as outlined above (for more
// information about lexertl go here: http://www.benhanson.net/lexertl.html).
141 //
142 // This class is supposed to be used as the first and only template
143 // parameter while instantiating instances of a lex::lexer class.
144 //
145 ///////////////////////////////////////////////////////////////////////////
146 template <typename Token = token<>
147 , typename Iterator = typename Token::iterator_type
148 , typename Functor = functor<Token, lexertl::detail::data, Iterator> >
149 class lexer
150 {
151 private:
152 struct dummy { void true_() {} };
153 typedef void (dummy::*safe_bool)();
154
155 static std::size_t const all_states_id = static_cast<std::size_t>(-2);
156
157 public:
158 operator safe_bool() const
159 { return initialized_dfa_ ? &dummy::true_ : 0; }
160
161 typedef typename std::iterator_traits<Iterator>::value_type char_type;
162 typedef std::basic_string<char_type> string_type;
163
164 typedef boost::lexer::basic_rules<char_type> basic_rules_type;
165
166 // Every lexer type to be used as a lexer for Spirit has to conform to
167 // a public interface .
168 typedef Token token_type;
169 typedef typename Token::id_type id_type;
170 typedef iterator<Functor> iterator_type;
171
172 private:
173#ifdef _MSC_VER
174# pragma warning(push)
175# pragma warning(disable: 4512) // assignment operator could not be generated.
176#endif
177 // this type is purely used for the iterator_type construction below
178 struct iterator_data_type
179 {
180 typedef typename Functor::semantic_actions_type semantic_actions_type;
181
182 iterator_data_type(
183 boost::lexer::basic_state_machine<char_type> const& sm
184 , boost::lexer::basic_rules<char_type> const& rules
185 , semantic_actions_type const& actions)
186 : state_machine_(sm), rules_(rules), actions_(actions)
187 {}
188
189 boost::lexer::basic_state_machine<char_type> const& state_machine_;
190 boost::lexer::basic_rules<char_type> const& rules_;
191 semantic_actions_type const& actions_;
192 };
193#ifdef _MSC_VER
194# pragma warning(pop)
195#endif
196
197 public:
198 // Return the start iterator usable for iterating over the generated
199 // tokens.
200 iterator_type begin(Iterator& first, Iterator const& last
201 , char_type const* initial_state = 0) const
202 {
203 if (!init_dfa()) // never minimize DFA for dynamic lexers
204 return iterator_type();
205
206 iterator_data_type iterator_data(state_machine_, rules_, actions_);
207 return iterator_type(iterator_data, first, last, initial_state);
208 }
209
210 // Return the end iterator usable to stop iterating over the generated
211 // tokens.
212 iterator_type end() const
213 {
214 return iterator_type();
215 }
216
217 protected:
218 // Lexer instances can be created by means of a derived class only.
219 lexer(unsigned int flags)
220 : flags_(detail::map_flags(flags))
221 , rules_(flags_)
222 , initialized_dfa_(false)
223 {}
224
225 public:
226 // interface for token definition management
227 std::size_t add_token(char_type const* state, char_type tokendef,
228 std::size_t token_id, char_type const* targetstate)
229 {
230 add_state(state);
231 initialized_dfa_ = false;
232 if (state == all_states())
233 return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot());
234
235 if (0 == targetstate)
236 targetstate = state;
237 else
238 add_state(state: targetstate);
239 return rules_.add(state, detail::escape(tokendef), token_id, targetstate);
240 }
241 std::size_t add_token(char_type const* state, string_type const& tokendef,
242 std::size_t token_id, char_type const* targetstate)
243 {
244 add_state(state);
245 initialized_dfa_ = false;
246 if (state == all_states())
247 return rules_.add(state, tokendef, token_id, rules_.dot());
248
249 if (0 == targetstate)
250 targetstate = state;
251 else
252 add_state(state: targetstate);
253 return rules_.add(state, tokendef, token_id, targetstate);
254 }
255
256 // interface for pattern definition management
257 void add_pattern (char_type const* state, string_type const& name,
258 string_type const& patterndef)
259 {
260 add_state(state);
261 rules_.add_macro(name.c_str(), patterndef);
262 initialized_dfa_ = false;
263 }
264
265 boost::lexer::rules const& get_rules() const { return rules_; }
266
267 void clear(char_type const* state)
268 {
269 std::size_t s = rules_.state(state);
270 if (boost::lexer::npos != s)
271 rules_.clear(state);
272 initialized_dfa_ = false;
273 }
274 std::size_t add_state(char_type const* state)
275 {
276 if (state == all_states())
277 return all_states_id;
278
279 std::size_t stateid = rules_.state(state);
280 if (boost::lexer::npos == stateid) {
281 stateid = rules_.add_state(state);
282 initialized_dfa_ = false;
283 }
284 return stateid;
285 }
286 string_type initial_state() const
287 {
288 return string_type(rules_.initial());
289 }
290 string_type all_states() const
291 {
292 return string_type(rules_.all_states());
293 }
294
295 // Register a semantic action with the given id
296 template <typename F>
297 void add_action(std::size_t unique_id, std::size_t state, F act)
298 {
299 // If you see an error here stating add_action is not a member of
300 // fusion::unused_type then you are probably having semantic actions
301 // attached to at least one token in the lexer definition without
302 // using the lex::lexertl::actor_lexer<> as its base class.
303 typedef typename Functor::wrap_action_type wrapper_type;
304 if (state == all_states_id) {
305 // add the action to all known states
306 typedef typename
307 basic_rules_type::string_size_t_map::const_iterator
308 state_iterator;
309
310 std::size_t states = rules_.statemap().size();
311 for (state_iterator it = rules_.statemap().begin(),
312 end = rules_.statemap().end(); it != end; ++it) {
313 for (std::size_t j = 0; j < states; ++j)
314 actions_.add_action(unique_id + j, it->second, wrapper_type::call(act));
315 }
316 }
317 else {
318 actions_.add_action(unique_id, state, wrapper_type::call(act));
319 }
320 }
321// template <typename F>
322// void add_action(std::size_t unique_id, char_type const* state, F act)
323// {
324// typedef typename Functor::wrap_action_type wrapper_type;
325// actions_.add_action(unique_id, add_state(state), wrapper_type::call(act));
326// }
327
328 // We do not minimize the state machine by default anymore because
329 // Ben said: "If you can afford to generate a lexer at runtime, there
330 // is little point in calling minimise."
331 // Go figure.
332 bool init_dfa(bool minimize = false) const
333 {
334 if (!initialized_dfa_) {
335 state_machine_.clear();
336 typedef boost::lexer::basic_generator<char_type> generator;
337 generator::build (rules_, state_machine_);
338 if (minimize)
339 generator::minimise (state_machine_);
340
341#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
342 boost::lexer::debug::dump(state_machine_, std::cerr);
343#endif
344 initialized_dfa_ = true;
345
346// // release memory held by rules description
347// basic_rules_type rules;
348// rules.init_state_info(rules_); // preserve states
349// std::swap(rules, rules_);
350 }
351 return true;
352 }
353
354 private:
355 // lexertl specific data
356 mutable boost::lexer::basic_state_machine<char_type> state_machine_;
357 boost::lexer::regex_flags flags_;
358 /*mutable*/ basic_rules_type rules_;
359
360 typename Functor::semantic_actions_type actions_;
361 mutable bool initialized_dfa_;
362
363 // generator functions must be able to access members directly
364 template <typename Lexer, typename F>
365 friend bool generate_static(Lexer const&
366 , std::basic_ostream<typename Lexer::char_type>&
367 , typename Lexer::char_type const*, F);
368 };
369
370 ///////////////////////////////////////////////////////////////////////////
371 //
372 // The actor_lexer class is another implementation of a Spirit.Lex
373 // lexer on top of Ben Hanson's lexertl library as outlined above (For
374 // more information about lexertl go here:
375 // http://www.benhanson.net/lexertl.html).
376 //
377 // The only difference to the lexer class above is that token_def
378 // definitions may have semantic (lexer) actions attached while being
379 // defined:
380 //
381 // int w;
382 // token_def word = "[^ \t\n]+";
383 // self = word[++ref(w)]; // see example: word_count_lexer
384 //
385 // This class is supposed to be used as the first and only template
386 // parameter while instantiating instances of a lex::lexer class.
387 //
388 ///////////////////////////////////////////////////////////////////////////
389 template <typename Token = token<>
390 , typename Iterator = typename Token::iterator_type
391 , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
392 class actor_lexer : public lexer<Token, Iterator, Functor>
393 {
394 protected:
395 // Lexer instances can be created by means of a derived class only.
396 actor_lexer(unsigned int flags)
397 : lexer<Token, Iterator, Functor>(flags) {}
398 };
399
400}}}}
401
402#endif
403

// source code of boost/libs/spirit/include/boost/spirit/home/lex/lexer/lexertl/lexer.hpp