1 | // Copyright (c) 2001-2011 Hartmut Kaiser |
2 | // |
3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying |
4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
5 | |
6 | #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM) |
7 | #define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM |
8 | |
9 | #if defined(_MSC_VER) |
10 | #pragma once |
11 | #endif |
12 | |
13 | #include <iosfwd> |
14 | |
15 | #include <boost/spirit/home/support/detail/lexer/generator.hpp> |
16 | #include <boost/spirit/home/support/detail/lexer/rules.hpp> |
17 | #include <boost/spirit/home/support/detail/lexer/consts.hpp> |
18 | #include <boost/spirit/home/support/unused.hpp> |
19 | |
20 | #include <boost/spirit/home/lex/lexer/lexertl/token.hpp> |
21 | #include <boost/spirit/home/lex/lexer/lexertl/functor.hpp> |
22 | #include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp> |
23 | #include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp> |
24 | #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) |
25 | #include <boost/spirit/home/support/detail/lexer/debug.hpp> |
26 | #endif |
27 | |
28 | #include <iterator> // for std::iterator_traits |
29 | |
30 | namespace boost { namespace spirit { namespace lex { namespace lexertl |
31 | { |
32 | /////////////////////////////////////////////////////////////////////////// |
33 | namespace detail |
34 | { |
35 | /////////////////////////////////////////////////////////////////////// |
36 | // The must_escape function checks if the given character value needs |
37 | // to be preceded by a backslash character to disable its special |
38 | // meaning in the context of a regular expression |
39 | /////////////////////////////////////////////////////////////////////// |
40 | template <typename Char> |
41 | inline bool must_escape(Char c) |
42 | { |
43 | // FIXME: more needed? |
44 | switch (c) { |
45 | case '+': case '/': case '*': case '?': |
46 | case '|': |
47 | case '(': case ')': |
48 | case '[': case ']': |
49 | case '{': case '}': |
50 | case '.': |
51 | case '^': case '$': |
52 | case '\\': |
53 | case '"': |
54 | return true; |
55 | |
56 | default: |
57 | break; |
58 | } |
59 | return false; |
60 | } |
61 | |
62 | /////////////////////////////////////////////////////////////////////// |
63 | // The escape function returns the string representation of the given |
64 | // character value, possibly escaped with a backslash character, to |
65 | // allow it being safely used in a regular expression definition. |
66 | /////////////////////////////////////////////////////////////////////// |
67 | template <typename Char> |
68 | inline std::basic_string<Char> escape(Char ch) |
69 | { |
70 | std::basic_string<Char> result(1, ch); |
71 | if (detail::must_escape(ch)) |
72 | { |
73 | typedef typename std::basic_string<Char>::size_type size_type; |
74 | result.insert((size_type)0, 1, '\\'); |
75 | } |
76 | return result; |
77 | } |
78 | |
        ///////////////////////////////////////////////////////////////////////
        //  The map_flags function maps the Spirit.Lex match_flags to the
        //  corresponding lexertl regex_flags.
        ///////////////////////////////////////////////////////////////////////
82 | inline boost::lexer::regex_flags map_flags(unsigned int flags) |
83 | { |
84 | unsigned int retval = boost::lexer::none; |
85 | if (flags & match_flags::match_not_dot_newline) |
86 | retval |= boost::lexer::dot_not_newline; |
87 | if (flags & match_flags::match_icase) |
88 | retval |= boost::lexer::icase; |
89 | |
90 | return boost::lexer::regex_flags(retval); |
91 | } |
92 | } |
93 | |
94 | /////////////////////////////////////////////////////////////////////////// |
95 | template <typename Lexer, typename F> |
96 | bool generate_static(Lexer const& |
97 | , std::basic_ostream<typename Lexer::char_type>& |
98 | , typename Lexer::char_type const*, F); |
99 | |
100 | /////////////////////////////////////////////////////////////////////////// |
101 | // |
102 | // Every lexer type to be used as a lexer for Spirit has to conform to |
103 | // the following public interface: |
104 | // |
105 | // typedefs: |
106 | // iterator_type The type of the iterator exposed by this lexer. |
107 | // token_type The type of the tokens returned from the exposed |
108 | // iterators. |
109 | // |
110 | // functions: |
111 | // default constructor |
112 | // Since lexers are instantiated as base classes |
113 | // only it might be a good idea to make this |
114 | // constructor protected. |
115 | // begin, end Return a pair of iterators, when dereferenced |
116 | // returning the sequence of tokens recognized in |
117 | // the input stream given as the parameters to the |
118 | // begin() function. |
119 | // add_token Should add the definition of a token to be |
120 | // recognized by this lexer. |
121 | // clear Should delete all current token definitions |
122 | // associated with the given state of this lexer |
123 | // object. |
124 | // |
125 | // template parameters: |
126 | // Iterator The type of the iterator used to access the |
127 | // underlying character stream. |
128 | // Token The type of the tokens to be returned from the |
129 | // exposed token iterator. |
130 | // Functor The type of the InputPolicy to use to instantiate |
131 | // the multi_pass iterator type to be used as the |
132 | // token iterator (returned from begin()/end()). |
133 | // |
134 | /////////////////////////////////////////////////////////////////////////// |
135 | |
136 | /////////////////////////////////////////////////////////////////////////// |
137 | // |
    //  The lexer class is an implementation of a Spirit.Lex lexer on
139 | // top of Ben Hanson's lexertl library as outlined above (For more |
140 | // information about lexertl go here: http://www.benhanson.net/lexertl.html). |
141 | // |
142 | // This class is supposed to be used as the first and only template |
143 | // parameter while instantiating instances of a lex::lexer class. |
144 | // |
145 | /////////////////////////////////////////////////////////////////////////// |
146 | template <typename Token = token<> |
147 | , typename Iterator = typename Token::iterator_type |
148 | , typename Functor = functor<Token, lexertl::detail::data, Iterator> > |
149 | class lexer |
150 | { |
151 | private: |
152 | struct dummy { void true_() {} }; |
153 | typedef void (dummy::*safe_bool)(); |
154 | |
155 | static std::size_t const all_states_id = static_cast<std::size_t>(-2); |
156 | |
157 | public: |
158 | operator safe_bool() const |
159 | { return initialized_dfa_ ? &dummy::true_ : 0; } |
160 | |
161 | typedef typename std::iterator_traits<Iterator>::value_type char_type; |
162 | typedef std::basic_string<char_type> string_type; |
163 | |
164 | typedef boost::lexer::basic_rules<char_type> basic_rules_type; |
165 | |
166 | // Every lexer type to be used as a lexer for Spirit has to conform to |
167 | // a public interface . |
168 | typedef Token token_type; |
169 | typedef typename Token::id_type id_type; |
170 | typedef iterator<Functor> iterator_type; |
171 | |
172 | private: |
173 | #ifdef _MSC_VER |
174 | # pragma warning(push) |
175 | # pragma warning(disable: 4512) // assignment operator could not be generated. |
176 | #endif |
177 | // this type is purely used for the iterator_type construction below |
178 | struct iterator_data_type |
179 | { |
180 | typedef typename Functor::semantic_actions_type semantic_actions_type; |
181 | |
182 | iterator_data_type( |
183 | boost::lexer::basic_state_machine<char_type> const& sm |
184 | , boost::lexer::basic_rules<char_type> const& rules |
185 | , semantic_actions_type const& actions) |
186 | : state_machine_(sm), rules_(rules), actions_(actions) |
187 | {} |
188 | |
189 | boost::lexer::basic_state_machine<char_type> const& state_machine_; |
190 | boost::lexer::basic_rules<char_type> const& rules_; |
191 | semantic_actions_type const& actions_; |
192 | }; |
193 | #ifdef _MSC_VER |
194 | # pragma warning(pop) |
195 | #endif |
196 | |
197 | public: |
198 | // Return the start iterator usable for iterating over the generated |
199 | // tokens. |
200 | iterator_type begin(Iterator& first, Iterator const& last |
201 | , char_type const* initial_state = 0) const |
202 | { |
203 | if (!init_dfa()) // never minimize DFA for dynamic lexers |
204 | return iterator_type(); |
205 | |
206 | iterator_data_type iterator_data(state_machine_, rules_, actions_); |
207 | return iterator_type(iterator_data, first, last, initial_state); |
208 | } |
209 | |
210 | // Return the end iterator usable to stop iterating over the generated |
211 | // tokens. |
212 | iterator_type end() const |
213 | { |
214 | return iterator_type(); |
215 | } |
216 | |
217 | protected: |
218 | // Lexer instances can be created by means of a derived class only. |
219 | lexer(unsigned int flags) |
220 | : flags_(detail::map_flags(flags)) |
221 | , rules_(flags_) |
222 | , initialized_dfa_(false) |
223 | {} |
224 | |
225 | public: |
226 | // interface for token definition management |
227 | std::size_t add_token(char_type const* state, char_type tokendef, |
228 | std::size_t token_id, char_type const* targetstate) |
229 | { |
230 | add_state(state); |
231 | initialized_dfa_ = false; |
232 | if (state == all_states()) |
233 | return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot()); |
234 | |
235 | if (0 == targetstate) |
236 | targetstate = state; |
237 | else |
238 | add_state(state: targetstate); |
239 | return rules_.add(state, detail::escape(tokendef), token_id, targetstate); |
240 | } |
241 | std::size_t add_token(char_type const* state, string_type const& tokendef, |
242 | std::size_t token_id, char_type const* targetstate) |
243 | { |
244 | add_state(state); |
245 | initialized_dfa_ = false; |
246 | if (state == all_states()) |
247 | return rules_.add(state, tokendef, token_id, rules_.dot()); |
248 | |
249 | if (0 == targetstate) |
250 | targetstate = state; |
251 | else |
252 | add_state(state: targetstate); |
253 | return rules_.add(state, tokendef, token_id, targetstate); |
254 | } |
255 | |
256 | // interface for pattern definition management |
257 | void add_pattern (char_type const* state, string_type const& name, |
258 | string_type const& patterndef) |
259 | { |
260 | add_state(state); |
261 | rules_.add_macro(name.c_str(), patterndef); |
262 | initialized_dfa_ = false; |
263 | } |
264 | |
265 | boost::lexer::rules const& get_rules() const { return rules_; } |
266 | |
267 | void clear(char_type const* state) |
268 | { |
269 | std::size_t s = rules_.state(state); |
270 | if (boost::lexer::npos != s) |
271 | rules_.clear(state); |
272 | initialized_dfa_ = false; |
273 | } |
274 | std::size_t add_state(char_type const* state) |
275 | { |
276 | if (state == all_states()) |
277 | return all_states_id; |
278 | |
279 | std::size_t stateid = rules_.state(state); |
280 | if (boost::lexer::npos == stateid) { |
281 | stateid = rules_.add_state(state); |
282 | initialized_dfa_ = false; |
283 | } |
284 | return stateid; |
285 | } |
286 | string_type initial_state() const |
287 | { |
288 | return string_type(rules_.initial()); |
289 | } |
290 | string_type all_states() const |
291 | { |
292 | return string_type(rules_.all_states()); |
293 | } |
294 | |
295 | // Register a semantic action with the given id |
296 | template <typename F> |
297 | void add_action(std::size_t unique_id, std::size_t state, F act) |
298 | { |
299 | // If you see an error here stating add_action is not a member of |
300 | // fusion::unused_type then you are probably having semantic actions |
301 | // attached to at least one token in the lexer definition without |
302 | // using the lex::lexertl::actor_lexer<> as its base class. |
303 | typedef typename Functor::wrap_action_type wrapper_type; |
304 | if (state == all_states_id) { |
305 | // add the action to all known states |
306 | typedef typename |
307 | basic_rules_type::string_size_t_map::const_iterator |
308 | state_iterator; |
309 | |
310 | std::size_t states = rules_.statemap().size(); |
311 | for (state_iterator it = rules_.statemap().begin(), |
312 | end = rules_.statemap().end(); it != end; ++it) { |
313 | for (std::size_t j = 0; j < states; ++j) |
314 | actions_.add_action(unique_id + j, it->second, wrapper_type::call(act)); |
315 | } |
316 | } |
317 | else { |
318 | actions_.add_action(unique_id, state, wrapper_type::call(act)); |
319 | } |
320 | } |
321 | // template <typename F> |
322 | // void add_action(std::size_t unique_id, char_type const* state, F act) |
323 | // { |
324 | // typedef typename Functor::wrap_action_type wrapper_type; |
325 | // actions_.add_action(unique_id, add_state(state), wrapper_type::call(act)); |
326 | // } |
327 | |
328 | // We do not minimize the state machine by default anymore because |
329 | // Ben said: "If you can afford to generate a lexer at runtime, there |
330 | // is little point in calling minimise." |
331 | // Go figure. |
332 | bool init_dfa(bool minimize = false) const |
333 | { |
334 | if (!initialized_dfa_) { |
335 | state_machine_.clear(); |
336 | typedef boost::lexer::basic_generator<char_type> generator; |
337 | generator::build (rules_, state_machine_); |
338 | if (minimize) |
339 | generator::minimise (state_machine_); |
340 | |
341 | #if defined(BOOST_SPIRIT_LEXERTL_DEBUG) |
342 | boost::lexer::debug::dump(state_machine_, std::cerr); |
343 | #endif |
344 | initialized_dfa_ = true; |
345 | |
346 | // // release memory held by rules description |
347 | // basic_rules_type rules; |
348 | // rules.init_state_info(rules_); // preserve states |
349 | // std::swap(rules, rules_); |
350 | } |
351 | return true; |
352 | } |
353 | |
354 | private: |
355 | // lexertl specific data |
356 | mutable boost::lexer::basic_state_machine<char_type> state_machine_; |
357 | boost::lexer::regex_flags flags_; |
358 | /*mutable*/ basic_rules_type rules_; |
359 | |
360 | typename Functor::semantic_actions_type actions_; |
361 | mutable bool initialized_dfa_; |
362 | |
363 | // generator functions must be able to access members directly |
364 | template <typename Lexer, typename F> |
365 | friend bool generate_static(Lexer const& |
366 | , std::basic_ostream<typename Lexer::char_type>& |
367 | , typename Lexer::char_type const*, F); |
368 | }; |
369 | |
370 | /////////////////////////////////////////////////////////////////////////// |
371 | // |
372 | // The actor_lexer class is another implementation of a Spirit.Lex |
373 | // lexer on top of Ben Hanson's lexertl library as outlined above (For |
374 | // more information about lexertl go here: |
375 | // http://www.benhanson.net/lexertl.html). |
376 | // |
377 | // The only difference to the lexer class above is that token_def |
378 | // definitions may have semantic (lexer) actions attached while being |
379 | // defined: |
380 | // |
381 | // int w; |
382 | // token_def word = "[^ \t\n]+"; |
383 | // self = word[++ref(w)]; // see example: word_count_lexer |
384 | // |
385 | // This class is supposed to be used as the first and only template |
386 | // parameter while instantiating instances of a lex::lexer class. |
387 | // |
388 | /////////////////////////////////////////////////////////////////////////// |
    template <typename Token = token<>
      , typename Iterator = typename Token::iterator_type
      , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
    class actor_lexer : public lexer<Token, Iterator, Functor>
    {
    protected:
        // Lexer instances can be created by means of a derived class only.
        // All functionality is inherited from lexer<>; the only difference is
        // the mpl::true_ passed as the last template argument of the default
        // Functor, which enables support for semantic (lexer) actions.
        actor_lexer(unsigned int flags)
          : lexer<Token, Iterator, Functor>(flags) {}
    };
399 | |
400 | }}}} |
401 | |
402 | #endif |
403 | |