1 | // Copyright (c) 2001-2010 Hartmut Kaiser |
2 | // |
3 | // Distributed under the Boost Software License, Version 1.0. (See accompanying |
4 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
5 | |
6 | // This example shows how to create a simple lexer recognizing a couple of |
7 | // different tokens aimed at a simple language and how to use this lexer with |
8 | // a grammar. It shows how to associate attributes to tokens and how to access the |
9 | // token attributes from inside the grammar. |
10 | // |
11 | // Additionally, this example demonstrates, how to define a token set usable |
12 | // as the skip parser during parsing, allowing to define several tokens to be |
13 | // ignored. |
14 | // |
15 | // The main purpose of this example is to show how inheritance can be used to |
16 | // overload parts of a base grammar and add token definitions to a base lexer. |
17 | // |
18 | // Further, it shows how you can use the 'omit' attribute type specifier |
19 | // for token definitions to force the token to have no attribute (expose an |
20 | // unused attribute). |
21 | // |
22 | // This example recognizes a very simple programming language having |
23 | // assignment statements and if and while control structures. Look at the file |
24 | // example5.input for an example. |
25 | |
26 | #include <boost/spirit/include/qi.hpp> |
27 | #include <boost/spirit/include/lex_lexertl.hpp> |
28 | #include <boost/phoenix/operator.hpp> |
29 | |
30 | #include <iostream> |
31 | #include <fstream> |
32 | #include <string> |
33 | |
34 | #include "example.hpp" |
35 | |
36 | using namespace boost::spirit; |
37 | using boost::phoenix::val; |
38 | |
39 | /////////////////////////////////////////////////////////////////////////////// |
40 | // Token definition base, defines all tokens for the base grammar below |
41 | /////////////////////////////////////////////////////////////////////////////// |
42 | template <typename Lexer> |
43 | struct example5_base_tokens : lex::lexer<Lexer> |
44 | { |
45 | protected: |
46 | // this lexer is supposed to be used as a base type only |
47 | example5_base_tokens() {} |
48 | |
49 | public: |
50 | void init_token_definitions() |
51 | { |
52 | // define the tokens to match |
53 | identifier = "[a-zA-Z_][a-zA-Z0-9_]*" ; |
54 | constant = "[0-9]+" ; |
55 | if_ = "if" ; |
56 | while_ = "while" ; |
57 | |
58 | // associate the tokens and the token set with the lexer |
59 | this->self += lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';' | constant; |
60 | this->self += if_ | while_ | identifier; |
61 | |
62 | // define the whitespace to ignore (spaces, tabs, newlines and C-style |
63 | // comments) |
64 | this->self("WS" ) |
65 | = lex::token_def<>("[ \\t\\n]+" ) |
66 | | "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/" |
67 | ; |
68 | } |
69 | |
70 | // these tokens have no attribute |
71 | lex::token_def<lex::omit> if_, while_; |
72 | |
73 | // The following two tokens have an associated attribute type, 'identifier' |
74 | // carries a string (the identifier name) and 'constant' carries the |
75 | // matched integer value. |
76 | // |
77 | // Note: any token attribute type explicitly specified in a token_def<> |
78 | // declaration needs to be listed during token type definition as |
79 | // well (see the typedef for the token_type below). |
80 | // |
81 | // The conversion of the matched input to an instance of this type occurs |
82 | // once (on first access), which makes token attributes as efficient as |
83 | // possible. Moreover, token instances are constructed once by the lexer |
84 | // library. From this point on tokens are passed by reference only, |
85 | // avoiding them being copied around. |
86 | lex::token_def<std::string> identifier; |
87 | lex::token_def<unsigned int> constant; |
88 | }; |
89 | |
90 | /////////////////////////////////////////////////////////////////////////////// |
91 | // Grammar definition base, defines a basic language |
92 | /////////////////////////////////////////////////////////////////////////////// |
93 | template <typename Iterator, typename Lexer> |
94 | struct example5_base_grammar |
95 | : qi::grammar<Iterator, qi::in_state_skipper<Lexer> > |
96 | { |
97 | template <typename TokenDef> |
98 | example5_base_grammar(TokenDef const& tok) |
99 | : example5_base_grammar::base_type(program) |
100 | { |
101 | using boost::spirit::_val; |
102 | |
103 | program |
104 | = +block |
105 | ; |
106 | |
107 | block |
108 | = '{' >> *statement >> '}' |
109 | ; |
110 | |
111 | statement |
112 | = assignment |
113 | | if_stmt |
114 | | while_stmt |
115 | ; |
116 | |
117 | assignment |
118 | = (tok.identifier >> '=' >> expression >> ';') |
119 | [ |
120 | std::cout << val(t: "assignment statement to: " ) << _1 << "\n" |
121 | ] |
122 | ; |
123 | |
124 | if_stmt |
125 | = (tok.if_ >> '(' >> expression >> ')' >> block) |
126 | [ |
127 | std::cout << val(t: "if expression: " ) << _1 << "\n" |
128 | ] |
129 | ; |
130 | |
131 | while_stmt |
132 | = (tok.while_ >> '(' >> expression >> ')' >> block) |
133 | [ |
134 | std::cout << val(t: "while expression: " ) << _1 << "\n" |
135 | ] |
136 | ; |
137 | |
138 | // since expression has a variant return type accommodating for |
139 | // std::string and unsigned integer, both possible values may be |
140 | // returned to the calling rule |
141 | expression |
142 | = tok.identifier [ _val = _1 ] |
143 | | tok.constant [ _val = _1 ] |
144 | ; |
145 | } |
146 | |
147 | typedef qi::in_state_skipper<Lexer> skipper_type; |
148 | |
149 | qi::rule<Iterator, skipper_type> program, block, statement; |
150 | qi::rule<Iterator, skipper_type> assignment, if_stmt; |
151 | qi::rule<Iterator, skipper_type> while_stmt; |
152 | |
153 | // the expression is the only rule having a return value |
154 | typedef boost::variant<unsigned int, std::string> expression_type; |
155 | qi::rule<Iterator, expression_type(), skipper_type> expression; |
156 | }; |
157 | |
158 | /////////////////////////////////////////////////////////////////////////////// |
159 | // Token definition for derived lexer, defines additional tokens |
160 | /////////////////////////////////////////////////////////////////////////////// |
161 | template <typename Lexer> |
162 | struct example5_tokens : example5_base_tokens<Lexer> |
163 | { |
164 | typedef example5_base_tokens<Lexer> base_type; |
165 | |
166 | example5_tokens() |
167 | { |
168 | // define the additional token to match |
169 | else_ = "else" ; |
170 | |
171 | // associate the new token with the lexer, note we add 'else' before |
172 | // anything else to add it to the token set before the identifier |
173 | // token, otherwise "else" would be matched as an identifier |
174 | this->self = else_; |
175 | |
176 | // now add the token definitions from the base class |
177 | this->base_type::init_token_definitions(); |
178 | } |
179 | |
180 | // this token has no attribute |
181 | lex::token_def<lex::omit> else_; |
182 | }; |
183 | |
184 | /////////////////////////////////////////////////////////////////////////////// |
185 | // Derived grammar definition, defines a language extension |
186 | /////////////////////////////////////////////////////////////////////////////// |
187 | template <typename Iterator, typename Lexer> |
188 | struct example5_grammar : example5_base_grammar<Iterator, Lexer> |
189 | { |
190 | template <typename TokenDef> |
191 | example5_grammar(TokenDef const& tok) |
192 | : example5_base_grammar<Iterator, Lexer>(tok) |
193 | { |
194 | // we alter the if_stmt only |
195 | this->if_stmt |
196 | = this->if_stmt.copy() >> -(tok.else_ >> this->block) |
197 | ; |
198 | } |
199 | }; |
200 | |
201 | /////////////////////////////////////////////////////////////////////////////// |
202 | int main() |
203 | { |
204 | // iterator type used to expose the underlying input stream |
205 | typedef std::string::iterator base_iterator_type; |
206 | |
207 | // This is the lexer token type to use. The second template parameter lists |
208 | // all attribute types used for token_def's during token definition (see |
209 | // example5_base_tokens<> above). Here we use the predefined lexertl token |
210 | // type, but any compatible token type may be used instead. |
211 | // |
212 | // If you don't list any token attribute types in the following declaration |
213 | // (or just use the default token type: lexertl_token<base_iterator_type>) |
214 | // it will compile and work just fine, just a bit less efficient. This is |
215 | // because the token attribute will be generated from the matched input |
216 | // sequence every time it is requested. But as soon as you specify at |
217 | // least one token attribute type you'll have to list all attribute types |
218 | // used for token_def<> declarations in the token definition class above, |
219 | // otherwise compilation errors will occur. |
220 | typedef lex::lexertl::token< |
221 | base_iterator_type, boost::mpl::vector<unsigned int, std::string> |
222 | > token_type; |
223 | |
224 | // Here we use the lexertl based lexer engine. |
225 | typedef lex::lexertl::lexer<token_type> lexer_type; |
226 | |
227 | // This is the token definition type (derived from the given lexer type). |
228 | typedef example5_tokens<lexer_type> example5_tokens; |
229 | |
230 | // this is the iterator type exposed by the lexer |
231 | typedef example5_tokens::iterator_type iterator_type; |
232 | |
233 | // this is the type of the grammar to parse |
234 | typedef example5_grammar<iterator_type, example5_tokens::lexer_def> example5_grammar; |
235 | |
236 | // now we use the types defined above to create the lexer and grammar |
237 | // object instances needed to invoke the parsing process |
238 | example5_tokens tokens; // Our lexer |
239 | example5_grammar calc(tokens); // Our parser |
240 | |
241 | std::string str (read_from_file(infile: "example5.input" )); |
242 | |
243 | // At this point we generate the iterator pair used to expose the |
244 | // tokenized input stream. |
245 | std::string::iterator it = str.begin(); |
246 | iterator_type iter = tokens.begin(first&: it, last: str.end()); |
247 | iterator_type end = tokens.end(); |
248 | |
249 | // Parsing is done based on the token stream, not the character |
250 | // stream read from the input. |
251 | // Note how we use the lexer defined above as the skip parser. It must |
252 | // be explicitly wrapped inside a state directive, switching the lexer |
253 | // state for the duration of skipping whitespace. |
254 | std::string ws("WS" ); |
255 | bool r = qi::phrase_parse(first&: iter, last: end, expr&: calc, skipper: qi::in_state(ws)[tokens.self]); |
256 | |
257 | if (r && iter == end) |
258 | { |
259 | std::cout << "-------------------------\n" ; |
260 | std::cout << "Parsing succeeded\n" ; |
261 | std::cout << "-------------------------\n" ; |
262 | } |
263 | else |
264 | { |
265 | std::cout << "-------------------------\n" ; |
266 | std::cout << "Parsing failed\n" ; |
267 | std::cout << "-------------------------\n" ; |
268 | } |
269 | |
270 | std::cout << "Bye... :-) \n\n" ; |
271 | return 0; |
272 | } |
273 | |