// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

// This example shows how to create a simple lexer recognizing a couple of
// different tokens aimed at a simple language and how to use this lexer with
// a grammar. It shows how to associate attributes with tokens and how to
// access the token attributes from inside the grammar.
//
// We use explicit token attribute types, making the corresponding token
// instances convert the matched input into an instance of that type. The
// token attribute is exposed as the parser attribute if this token is used
// as a parser component somewhere in a grammar.
//
// Additionally, this example demonstrates how to define a token set usable
// as the skip parser during parsing, allowing several tokens to be ignored.
//
// This example recognizes a very simple programming language having
// assignment statements and if and while control structures. Look at the
// file example4.input for an example.
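//
// For illustration, input along the following lines would be accepted by the
// grammar defined below (a sketch only; the actual contents of
// example4.input may differ):
//
//     {
//         a = 5;
//         if (a) { b = a; } else { b = 10; }
//         while (b) { b = 0; }
//     }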

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/phoenix/operator.hpp>

#include <iostream>
#include <fstream>
#include <string>

#include "example.hpp"

using namespace boost::spirit;
using boost::phoenix::val;

///////////////////////////////////////////////////////////////////////////////
// Token definition
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example4_tokens : lex::lexer<Lexer>
{
    example4_tokens()
    {
        // define the tokens to match
        identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
        constant = "[0-9]+";
        if_ = "if";
        else_ = "else";
        while_ = "while";

        // associate the tokens and the token set with the lexer
        this->self = lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';' | constant;
        this->self += if_ | else_ | while_ | identifier;
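        // Note: the keyword tokens are added before 'identifier'. For
        // matches of equal length the lexer prefers the token defined
        // first, so "if", "else", and "while" are recognized as keywords
        // rather than as identifiers.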

        // define the whitespace to ignore (spaces, tabs, newlines and C-style
        // comments)
        this->self("WS")
            =   lex::token_def<>("[ \\t\\n]+")
            |   "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
            ;
    }

    //[example4_token_def
    // these tokens expose the iterator_range of the matched input sequence
    lex::token_def<> if_, else_, while_;

    // The following two tokens have an associated attribute type:
    // 'identifier' carries a string (the identifier name) and 'constant'
    // carries the matched integer value.
    //
    // Note: any token attribute type explicitly specified in a token_def<>
    //       declaration needs to be listed during token type definition as
    //       well (see the typedef for the token_type below).
    //
    // The conversion of the matched input to an instance of this type occurs
    // once (on first access), which makes token attributes as efficient as
    // possible. Moreover, token instances are constructed once by the lexer
    // library. From this point on tokens are passed by reference only,
    // avoiding them being copied around.
    lex::token_def<std::string> identifier;
    lex::token_def<unsigned int> constant;
    //]
};

///////////////////////////////////////////////////////////////////////////////
// Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct example4_grammar
  : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
    template <typename TokenDef>
    example4_grammar(TokenDef const& tok)
      : example4_grammar::base_type(program)
    {
        using boost::spirit::_val;

        program
            =  +block
            ;

        block
            =   '{' >> *statement >> '}'
            ;

        statement
            =   assignment
            |   if_stmt
            |   while_stmt
            ;

        assignment
            =   (tok.identifier >> '=' >> expression >> ';')
                [
                    std::cout << val("assignment statement to: ") << _1 << "\n"
                ]
            ;

        if_stmt
            =   (   tok.if_ >> '(' >> expression >> ')' >> block
                    >> -(tok.else_ >> block)
                )
                [
                    std::cout << val("if expression: ") << _2 << "\n"
                ]
            ;

        while_stmt
            =   (tok.while_ >> '(' >> expression >> ')' >> block)
                [
                    std::cout << val("while expression: ") << _2 << "\n"
                ]
            ;

        // since 'expression' has a variant return type accommodating both
        // std::string and unsigned int, either value may be returned to the
        // calling rule
        expression
            =   tok.identifier [ _val = _1 ]
            |   tok.constant [ _val = _1 ]
            ;
    }

    typedef boost::variant<unsigned int, std::string> expression_type;

    qi::rule<Iterator, qi::in_state_skipper<Lexer> > program, block, statement;
    qi::rule<Iterator, qi::in_state_skipper<Lexer> > assignment, if_stmt;
    qi::rule<Iterator, qi::in_state_skipper<Lexer> > while_stmt;

    // the expression is the only rule having a return value
    qi::rule<Iterator, expression_type(), qi::in_state_skipper<Lexer> > expression;
};

///////////////////////////////////////////////////////////////////////////////
int main()
{
    // iterator type used to expose the underlying input stream
    typedef std::string::iterator base_iterator_type;

    //[example4_token
    // This is the lexer token type to use. The second template parameter
    // lists all attribute types used for token_def's during token definition
    // (see example4_tokens<> above). Here we use the predefined lexertl token
    // type, but any compatible token type may be used instead.
    //
    // If you don't list any token attribute types in the following
    // declaration (or just use the default token type, see the commented-out
    // variant below), the example will compile and work just fine, just a bit
    // less efficiently. This is because the token attribute will be generated
    // from the matched input sequence every time it is requested. But as soon
    // as you specify at least one token attribute type you'll have to list
    // all attribute types used for token_def<> declarations in the token
    // definition class above, otherwise compilation errors will occur.
    typedef lex::lexertl::token<
        base_iterator_type, boost::mpl::vector<unsigned int, std::string>
    > token_type;
    //]
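    // For comparison, the simpler but less efficient default-token variant
    // mentioned above would read as follows (attribute conversion then
    // happens on every access):
    //
    //     typedef lex::lexertl::token<base_iterator_type> token_type;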
    // Here we use the lexertl based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef example4_tokens<lexer_type> example4_tokens;

    // this is the iterator type exposed by the lexer
    typedef example4_tokens::iterator_type iterator_type;

    // this is the type of the grammar to parse
    typedef example4_grammar<iterator_type, example4_tokens::lexer_def> example4_grammar;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    example4_tokens tokens;                         // Our lexer
    example4_grammar calc(tokens);                  // Our parser

    std::string str(read_from_file("example4.input"));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    std::string::iterator it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();

    // Parsing is done based on the token stream, not the character
    // stream read from the input.
    // Note how we use the lexer defined above as the skip parser. It must
    // be explicitly wrapped inside a state directive, switching the lexer
    // state for the duration of skipping whitespace.
    bool r = qi::phrase_parse(iter, end, calc, qi::in_state("WS")[tokens.self]);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }

    std::cout << "Bye... :-) \n\n";
    return 0;
}