// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

// This example is equivalent to the following flex program:
/*
//[wcf_flex_version
    %{
        #define ID_WORD 1000
        #define ID_EOL  1001
        #define ID_CHAR 1002
        int c = 0, w = 0, l = 0;
    %}
    %%
    [^ \t\n]+  { return ID_WORD; }
    \n         { return ID_EOL; }
    .          { return ID_CHAR; }
    %%
    bool count(int tok)
    {
        switch (tok) {
        case ID_WORD: ++w; c += yyleng; break;
        case ID_EOL:  ++l; ++c; break;
        case ID_CHAR: ++c; break;
        default:
            return false;
        }
        return true;
    }
    int main()
    {
        int tok = EOF;
        do {
            tok = yylex();
            if (!count(tok))
                break;
        } while (EOF != tok);
        printf("%d %d %d\n", l, w, c);
        return 0;
    }
//]
*/
// Its purpose is to replicate the word count functionality of the UNIX 'wc'
// command: it prints the number of lines, words, and characters in a file.
//
// This example shows how to use the tokenize() function together with a
// simple functor, which is executed whenever a token is matched in the
// input sequence.
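//
// A typical invocation might look like this (assuming the executable is
// named word_count_functor; when no argument is given, the default input
// file "word_count.input" is read instead):
//
//     $ ./word_count_functor some_file.txt
//     lines: <l>, words: <w>, characters: <c>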

// #define BOOST_SPIRIT_LEXERTL_DEBUG

//[wcf_includes
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/bind/bind.hpp>
#include <boost/ref.hpp>
//]

#include <iostream>
#include <string>

#include "example.hpp"

//[wcf_namespaces
namespace lex = boost::spirit::lex;
//]

///////////////////////////////////////////////////////////////////////////////
// Token id definitions
///////////////////////////////////////////////////////////////////////////////
//[wcf_token_ids
enum token_ids
{
    ID_WORD = 1000,
    ID_EOL,
    ID_CHAR
};
//]

//[wcf_token_definition
/*` The template `word_count_tokens` defines three different tokens:
    `ID_WORD`, `ID_EOL`, and `ID_CHAR`, representing a word (anything except
    whitespace or a newline), a newline character, and any other character
    (`ID_WORD`, `ID_EOL`, and `ID_CHAR` are enum values representing the token
    ids, but they could be anything else convertible to an integer as well).
    The direct base class of any token definition class needs to be the
    template `lex::lexer<>`, where the corresponding template parameter (here:
    `lex::lexertl::lexer<BaseIterator>`) defines which underlying lexer engine
    is to be used to provide the required state machine functionality. In this
    example we use the Lexertl-based lexer engine as the underlying lexer type.
*/
template <typename Lexer>
struct word_count_tokens : lex::lexer<Lexer>
{
    word_count_tokens()
    {
        // define tokens (the regular expression to match and the corresponding
        // token id) and add them to the lexer
        this->self.add
            ("[^ \t\n]+", ID_WORD) // words (anything except ' ', '\t' or '\n')
            ("\n", ID_EOL)         // newline characters
            (".", ID_CHAR)         // anything else is a plain character
        ;
    }
};
//]
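
// The same token definitions could also be added one at a time instead of
// using the chained syntax above (a sketch, functionally equivalent):
//
//     this->self.add("[^ \t\n]+", ID_WORD);
//     this->self.add("\n", ID_EOL);
//     this->self.add(".", ID_CHAR);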

//[wcf_functor
/*` In this example the struct `counter` is used as a functor counting the
    characters, words, and lines in the analyzed input sequence by identifying
    the matched tokens as passed from the /Spirit.Lex/ library.
*/
struct counter
{
    //<- this is an implementation detail specific to boost::bind and doesn't
    //   show up in the documentation
    typedef bool result_type;
    //->
    // the function operator gets called for each of the matched tokens
    // c, l, w are references to the counters used to keep track of the numbers
    template <typename Token>
    bool operator()(Token const& t, std::size_t& c, std::size_t& w, std::size_t& l) const
    {
        switch (t.id()) {
        case ID_WORD:       // matched a word
            // since we're using a default token type in this example, every
            // token instance contains an `iterator_range<BaseIterator>` as its
            // token attribute pointing to the matched character sequence in
            // the input
            ++w; c += t.value().size();
            break;
        case ID_EOL:        // matched a newline character
            ++l; ++c;
            break;
        case ID_CHAR:       // matched something else
            ++c;
            break;
        }
        return true;        // always continue to tokenize
    }
};
//]
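
// Note: with C++14, the boost::bind expression used in main() below could be
// replaced by a generic lambda capturing the counters by reference, for
// example (a sketch, functionally equivalent to the bind expression):
//
//     lex::tokenize(first, last, word_count_functor,
//         [&](auto const& t) { return counter()(t, c, w, l); });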

///////////////////////////////////////////////////////////////////////////////
//[wcf_main
/*` The main function simply loads the given file into memory (as a
    `std::string`), instantiates an instance of the token definition template
    using the correct lexer type (`word_count_tokens<lex::lexertl::lexer<> >`;
    the lexer's default base iterator type is `char const*`), and finally
    calls `lex::tokenize`, passing an instance of the counter function object.
    The return value of `lex::tokenize()` will be `true` if the whole input
    sequence has been successfully tokenized, and `false` otherwise.
*/
int main(int argc, char* argv[])
{
    // these variables are used to count characters, words and lines
    std::size_t c = 0, w = 0, l = 0;

    // read input from the given file
    std::string str(read_from_file(1 == argc ? "word_count.input" : argv[1]));

    // create the token definition instance needed to invoke the lexical analyzer
    word_count_tokens<lex::lexertl::lexer<> > word_count_functor;

    // tokenize the given string, the bound functor gets invoked for each of
    // the matched tokens
    using boost::placeholders::_1;
    char const* first = str.c_str();
    char const* last = &first[str.size()];
    bool r = lex::tokenize(first, last, word_count_functor,
        boost::bind(counter(), _1, boost::ref(c), boost::ref(w), boost::ref(l)));

    // print results
    if (r) {
        std::cout << "lines: " << l << ", words: " << w
                  << ", characters: " << c << "\n";
    }
    else {
        std::string rest(first, last);
        std::cout << "Lexical analysis failed\n" << "stopped at: \""
                  << rest << "\"\n";
    }
    return 0;
}
//]
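
// Spirit.Lex is header-only, so no Boost libraries need to be linked.
// Assuming the Boost headers are on the include path and this file is named
// word_count_functor.cpp (an assumption), a possible build command is:
//
//     g++ -O2 -o word_count_functor word_count_functor.cpp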