1 | // Copyright (c) 2001-2011 Hartmut Kaiser |
2 | // |
3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying |
4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
5 | |
6 | #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM) |
7 | #define BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM |
8 | |
9 | #if defined(_MSC_VER) |
10 | #pragma once |
11 | #endif |
12 | |
13 | #include <boost/spirit/home/support/info.hpp> |
14 | #include <boost/spirit/home/qi/skip_over.hpp> |
15 | #include <boost/spirit/home/qi/parser.hpp> |
16 | #include <boost/spirit/home/qi/detail/assign_to.hpp> |
17 | #include <boost/spirit/home/lex/reference.hpp> |
18 | #include <boost/spirit/home/lex/meta_compiler.hpp> |
19 | #include <boost/spirit/home/lex/lexer_type.hpp> |
20 | #include <boost/spirit/home/lex/lexer/token_def.hpp> |
21 | #include <boost/assert.hpp> |
22 | #include <boost/noncopyable.hpp> |
23 | #include <boost/fusion/include/vector.hpp> |
24 | #include <boost/mpl/assert.hpp> |
25 | #include <boost/proto/extends.hpp> |
26 | #include <boost/proto/traits.hpp> |
27 | #include <boost/range/iterator_range_core.hpp> |
28 | #include <iterator> // for std::iterator_traits |
29 | #include <string> |
30 | |
31 | namespace boost { namespace spirit { namespace lex |
32 | { |
33 | /////////////////////////////////////////////////////////////////////////// |
34 | namespace detail |
35 | { |
36 | /////////////////////////////////////////////////////////////////////// |
37 | #ifdef _MSC_VER |
38 | # pragma warning(push) |
39 | # pragma warning(disable: 4512) // assignment operator could not be generated. |
40 | #endif |
41 | template <typename LexerDef> |
42 | struct lexer_def_ |
43 | : proto::extends< |
44 | typename proto::terminal< |
45 | lex::reference<lexer_def_<LexerDef> const> |
46 | >::type |
47 | , lexer_def_<LexerDef> > |
48 | , qi::parser<lexer_def_<LexerDef> > |
49 | , lex::lexer_type<lexer_def_<LexerDef> > |
50 | { |
51 | private: |
52 | // avoid warnings about using 'this' in constructor |
53 | lexer_def_& this_() { return *this; } |
54 | |
55 | typedef typename LexerDef::char_type char_type; |
56 | typedef typename LexerDef::string_type string_type; |
57 | typedef typename LexerDef::id_type id_type; |
58 | |
59 | typedef lex::reference<lexer_def_ const> reference_; |
60 | typedef typename proto::terminal<reference_>::type terminal_type; |
61 | typedef proto::extends<terminal_type, lexer_def_> proto_base_type; |
62 | |
63 | reference_ alias() const |
64 | { |
65 | return reference_(*this); |
66 | } |
67 | |
68 | public: |
69 | // Qi interface: metafunction calculating parser attribute type |
70 | template <typename Context, typename Iterator> |
71 | struct attribute |
72 | { |
73 | // the return value of a token set contains the matched token |
74 | // id, and the corresponding pair of iterators |
75 | typedef typename Iterator::base_iterator_type iterator_type; |
76 | typedef |
77 | fusion::vector2<id_type, iterator_range<iterator_type> > |
78 | type; |
79 | }; |
80 | |
81 | // Qi interface: parse functionality |
82 | template <typename Iterator, typename Context |
83 | , typename Skipper, typename Attribute> |
84 | bool parse(Iterator& first, Iterator const& last |
85 | , Context& /*context*/, Skipper const& skipper |
86 | , Attribute& attr) const |
87 | { |
88 | qi::skip_over(first, last, skipper); // always do a pre-skip |
89 | |
90 | if (first != last) { |
91 | typedef typename |
92 | std::iterator_traits<Iterator>::value_type |
93 | token_type; |
94 | |
95 | token_type const& t = *first; |
96 | if (token_is_valid(t) && t.state() == first.get_state()) { |
97 | // any of the token definitions matched |
98 | spirit::traits::assign_to(t, attr); |
99 | ++first; |
100 | return true; |
101 | } |
102 | } |
103 | return false; |
104 | } |
105 | |
106 | // Qi interface: 'what' functionality |
107 | template <typename Context> |
108 | info what(Context& /*context*/) const |
109 | { |
110 | return info("lexer" ); |
111 | } |
112 | |
113 | private: |
114 | // allow to use the lexer.self.add("regex1", id1)("regex2", id2); |
115 | // syntax |
116 | struct adder |
117 | { |
118 | adder(lexer_def_& def_) |
119 | : def(def_) {} |
120 | |
121 | // Add a token definition based on a single character as given |
122 | // by the first parameter, the second parameter allows to |
123 | // specify the token id to use for the new token. If no token |
124 | // id is given the character code is used. |
125 | adder const& operator()(char_type c |
126 | , id_type token_id = id_type()) const |
127 | { |
128 | if (id_type() == token_id) |
129 | token_id = static_cast<id_type>(c); |
130 | def.def.add_token (def.state.c_str(), c, token_id |
131 | , def.targetstate.empty() ? 0 : def.targetstate.c_str()); |
132 | return *this; |
133 | } |
134 | |
135 | // Add a token definition based on a character sequence as |
136 | // given by the first parameter, the second parameter allows to |
137 | // specify the token id to use for the new token. If no token |
138 | // id is given this function will generate a unique id to be |
139 | // used as the token's id. |
140 | adder const& operator()(string_type const& s |
141 | , id_type token_id = id_type()) const |
142 | { |
143 | if (id_type() == token_id) |
144 | token_id = def.def.get_next_id(); |
145 | def.def.add_token (def.state.c_str(), s, token_id |
146 | , def.targetstate.empty() ? 0 : def.targetstate.c_str()); |
147 | return *this; |
148 | } |
149 | |
150 | template <typename Attribute> |
151 | adder const& operator()( |
152 | token_def<Attribute, char_type, id_type>& tokdef |
153 | , id_type token_id = id_type()) const |
154 | { |
155 | // make sure we have a token id |
156 | if (id_type() == token_id) { |
157 | if (id_type() == tokdef.id()) { |
158 | token_id = def.def.get_next_id(); |
159 | tokdef.id(token_id); |
160 | } |
161 | else { |
162 | token_id = tokdef.id(); |
163 | } |
164 | } |
165 | else { |
166 | // the following assertion makes sure that the token_def |
167 | // instance has not been assigned a different id earlier |
168 | BOOST_ASSERT(id_type() == tokdef.id() |
169 | || token_id == tokdef.id()); |
170 | tokdef.id(token_id); |
171 | } |
172 | |
173 | def.define(tokdef); |
174 | return *this; |
175 | } |
176 | |
177 | // template <typename F> |
178 | // adder const& operator()(char_type c, id_type token_id, F act) const |
179 | // { |
180 | // if (id_type() == token_id) |
181 | // token_id = def.def.get_next_id(); |
182 | // std::size_t unique_id = |
183 | // def.def.add_token (def.state.c_str(), s, token_id); |
184 | // def.def.add_action(unique_id, def.state.c_str(), act); |
185 | // return *this; |
186 | // } |
187 | |
188 | lexer_def_& def; |
189 | }; |
190 | friend struct adder; |
191 | |
192 | // allow to use lexer.self.add_pattern("pattern1", "regex1")(...); |
193 | // syntax |
194 | struct pattern_adder |
195 | { |
196 | pattern_adder(lexer_def_& def_) |
197 | : def(def_) {} |
198 | |
199 | pattern_adder const& operator()(string_type const& p |
200 | , string_type const& s) const |
201 | { |
202 | def.def.add_pattern (def.state.c_str(), p, s); |
203 | return *this; |
204 | } |
205 | |
206 | lexer_def_& def; |
207 | }; |
208 | friend struct pattern_adder; |
209 | |
210 | private: |
211 | // Helper function to invoke the necessary 2 step compilation |
212 | // process on token definition expressions |
213 | template <typename TokenExpr> |
214 | void compile2pass(TokenExpr const& expr) |
215 | { |
216 | expr.collect(def, state, targetstate); |
217 | expr.add_actions(def); |
218 | } |
219 | |
220 | public: |
221 | /////////////////////////////////////////////////////////////////// |
222 | template <typename Expr> |
223 | void define(Expr const& expr) |
224 | { |
225 | compile2pass(compile<lex::domain>(expr)); |
226 | } |
227 | |
228 | lexer_def_(LexerDef& def_, string_type const& state_ |
229 | , string_type const& targetstate_ = string_type()) |
230 | : proto_base_type(terminal_type::make(alias())) |
231 | , add(this_()), add_pattern(this_()), def(def_) |
232 | , state(state_), targetstate(targetstate_) |
233 | {} |
234 | |
235 | // allow to switch states |
236 | lexer_def_ operator()(char_type const* state_) const |
237 | { |
238 | return lexer_def_(def, state_); |
239 | } |
240 | lexer_def_ operator()(char_type const* state_ |
241 | , char_type const* targetstate_) const |
242 | { |
243 | return lexer_def_(def, state_, targetstate_); |
244 | } |
245 | lexer_def_ operator()(string_type const& state_ |
246 | , string_type const& targetstate_ = string_type()) const |
247 | { |
248 | return lexer_def_(def, state_, targetstate_); |
249 | } |
250 | |
251 | // allow to assign a token definition expression |
252 | template <typename Expr> |
253 | lexer_def_& operator= (Expr const& xpr) |
254 | { |
255 | // Report invalid expression error as early as possible. |
256 | // If you got an error_invalid_expression error message here, |
257 | // then the expression (expr) is not a valid spirit lex |
258 | // expression. |
259 | BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr); |
260 | |
261 | def.clear(state.c_str()); |
262 | define(xpr); |
263 | return *this; |
264 | } |
265 | |
266 | // explicitly tell the lexer that the given state will be defined |
267 | // (useful in conjunction with "*") |
268 | std::size_t add_state(char_type const* state_ = 0) |
269 | { |
270 | return def.add_state(state_ ? state_ : def.initial_state().c_str()); |
271 | } |
272 | |
273 | adder add; |
274 | pattern_adder add_pattern; |
275 | |
276 | private: |
277 | LexerDef& def; |
278 | string_type state; |
279 | string_type targetstate; |
280 | }; |
281 | #ifdef _MSC_VER |
282 | # pragma warning(pop) |
283 | #endif |
284 | |
285 | #if defined(BOOST_NO_CXX11_RVALUE_REFERENCES) |
286 | // allow to assign a token definition expression |
287 | template <typename LexerDef, typename Expr> |
288 | inline lexer_def_<LexerDef>& |
289 | operator+= (lexer_def_<LexerDef>& lexdef, Expr& xpr) |
290 | { |
291 | // Report invalid expression error as early as possible. |
292 | // If you got an error_invalid_expression error message here, |
293 | // then the expression (expr) is not a valid spirit lex |
294 | // expression. |
295 | BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr); |
296 | |
297 | lexdef.define(xpr); |
298 | return lexdef; |
299 | } |
300 | #else |
301 | // allow to assign a token definition expression |
302 | template <typename LexerDef, typename Expr> |
303 | inline lexer_def_<LexerDef>& |
304 | operator+= (lexer_def_<LexerDef>& lexdef, Expr&& xpr) |
305 | { |
306 | // Report invalid expression error as early as possible. |
307 | // If you got an error_invalid_expression error message here, |
308 | // then the expression (expr) is not a valid spirit lex |
309 | // expression. |
310 | BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr); |
311 | |
312 | lexdef.define(xpr); |
313 | return lexdef; |
314 | } |
315 | #endif |
316 | |
317 | template <typename LexerDef, typename Expr> |
318 | inline lexer_def_<LexerDef>& |
319 | operator+= (lexer_def_<LexerDef>& lexdef, Expr const& xpr) |
320 | { |
321 | // Report invalid expression error as early as possible. |
322 | // If you got an error_invalid_expression error message here, |
323 | // then the expression (expr) is not a valid spirit lex |
324 | // expression. |
325 | BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr); |
326 | |
327 | lexdef.define(xpr); |
328 | return lexdef; |
329 | } |
330 | } |
331 | |
///////////////////////////////////////////////////////////////////////////
// match_flags: values which can be passed to the lexer constructor to
// select the matching mode of the lexer
struct match_flags
{
    enum enum_type
    {
        // no flags
        match_default = 0x0,
        // the regex '.' doesn't match newlines
        match_not_dot_newline = 0x1,
        // all matching operations are case insensitive
        match_icase = 0x2
    };
};
344 | |
///////////////////////////////////////////////////////////////////////////
// The lexer class template defined further below represents a lexer object
///////////////////////////////////////////////////////////////////////////
348 | |
///////////////////////////////////////////////////////////////////////////
// First token id the library assigns automatically when a token id is
// needed but none has been specified
enum tokenids
{
    min_token_id = 1 << 16      // == 0x10000
};
356 | |
357 | template <typename Lexer> |
358 | class lexer : public Lexer |
359 | { |
360 | private: |
361 | // avoid warnings about using 'this' in constructor |
362 | lexer& this_() { return *this; } |
363 | |
364 | std::size_t next_token_id; // has to be an integral type |
365 | |
366 | public: |
367 | typedef Lexer lexer_type; |
368 | typedef typename Lexer::id_type id_type; |
369 | typedef typename Lexer::char_type char_type; |
370 | typedef typename Lexer::iterator_type iterator_type; |
371 | typedef lexer base_type; |
372 | |
373 | typedef detail::lexer_def_<lexer> lexer_def; |
374 | typedef std::basic_string<char_type> string_type; |
375 | |
376 | // if `id_type` was specified but `first_id` is not provided |
377 | // the `min_token_id` value may be out of range for `id_type`, |
378 | // but it will be a problem only if unique ids feature is in use. |
379 | lexer(unsigned int flags = match_flags::match_default) |
380 | : lexer_type(flags) |
381 | , next_token_id(min_token_id) |
382 | , self(this_(), lexer_type::initial_state()) |
383 | {} |
384 | |
385 | lexer(unsigned int flags, id_type first_id) |
386 | : lexer_type(flags) |
387 | , next_token_id(first_id) |
388 | , self(this_(), lexer_type::initial_state()) |
389 | {} |
390 | |
391 | // access iterator interface |
392 | template <typename Iterator> |
393 | iterator_type begin(Iterator& first, Iterator const& last |
394 | , char_type const* initial_state = 0) const |
395 | { return this->lexer_type::begin(first, last, initial_state); } |
396 | iterator_type end() const |
397 | { return this->lexer_type::end(); } |
398 | |
399 | std::size_t map_state(char_type const* state) |
400 | { return this->lexer_type::add_state(state); } |
401 | |
402 | // create a unique token id |
403 | id_type get_next_id() { return id_type(next_token_id++); } |
404 | |
405 | lexer_def self; // allow for easy token definition |
406 | }; |
407 | |
408 | }}} |
409 | |
410 | #endif |
411 | |