| 1 | // pest. The Elegant Parser |
| 2 | // Copyright (c) 2018 Dragoș Tiselice |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 |
| 5 | // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT |
| 6 | // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 7 | // option. All files in the project carrying such notice may not be copied, |
| 8 | // modified, or distributed except according to those terms. |
| 9 | #![doc ( |
| 10 | html_root_url = "https://docs.rs/pest_derive" , |
| 11 | html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg" , |
| 12 | html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg" |
| 13 | )] |
| 14 | #![warn (missing_docs, rust_2018_idioms, unused_qualifications)] |
| 15 | //! # pest. The Elegant Parser |
| 16 | //! |
| 17 | //! pest is a general purpose parser written in Rust with a focus on accessibility, correctness, |
| 18 | //! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in |
| 19 | //! spirit to regular expressions, but which offer the enhanced expressivity needed to parse |
| 20 | //! complex languages. |
| 21 | //! |
| 22 | //! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar |
| 23 | //! |
| 24 | //! ## Getting started |
| 25 | //! |
| 26 | //! The recommended way to start parsing with pest is to read the official [book]. |
| 27 | //! |
| 28 | //! Other helpful resources: |
| 29 | //! |
| 30 | //! * API reference on [docs.rs] |
| 31 | //! * play with grammars and share them on our [fiddle] |
| 32 | //! * find previous common questions answered or ask questions on [GitHub Discussions] |
| 33 | //! * leave feedback, ask questions, or greet us on [Gitter] or [Discord] |
| 34 | //! |
| 35 | //! [book]: https://pest.rs/book |
| 36 | //! [docs.rs]: https://docs.rs/pest |
| 37 | //! [fiddle]: https://pest.rs/#editor |
| 38 | //! [Gitter]: https://gitter.im/pest-parser/pest |
| 39 | //! [Discord]: https://discord.gg/XEGACtWpT2 |
| 40 | //! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions |
| 41 | //! |
| 42 | //! |
| 43 | //! ## `.pest` files |
| 44 | //! |
| 45 | //! Grammar definitions reside in custom `.pest` files located in the `src` directory. Their path is |
| 46 | //! relative to `src` and is specified between the `derive` attribute and empty `struct` that |
| 47 | //! `Parser` will be derived on. |
| 48 | //! |
| 49 | //! ```ignore |
| 50 | //! #[derive(Parser)] |
| 51 | //! #[grammar = "path/to/my_grammar.pest" ] // relative to src |
| 52 | //! struct MyParser; |
| 53 | //! ``` |
| 54 | //! |
| 55 | //! ## Inline grammars |
| 56 | //! |
| 57 | //! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute. |
| 58 | //! |
| 59 | //! ## Grammar |
| 60 | //! |
| 61 | //! A grammar is a series of rules separated by whitespace, possibly containing comments. |
| 62 | //! |
| 63 | //! ### Comments |
| 64 | //! |
| 65 | //! Comments start with `//` and end at the end of the line. |
| 66 | //! |
| 67 | //! ```text |
| 68 | //! // a comment |
| 69 | //! ``` |
| 70 | //! |
| 71 | //! ### Rules |
| 72 | //! |
| 73 | //! Rules have the following form: |
| 74 | //! |
| 75 | //! ```ignore |
| 76 | //! name = optional_modifier { expression } |
| 77 | //! ``` |
| 78 | //! |
| 79 | //! The name of the rule is formed from alphanumeric characters or `_` with the condition that the |
| 80 | //! first character is not a digit and is used to create token pairs. When the rule starts being |
| 81 | //! parsed, the starting part of the token is being produced, with the ending part being produced |
| 82 | //! when the rule finishes parsing. |
| 83 | //! |
| 84 | //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end |
| 85 | //! `b`, start `c`, end `c`, end `a`. |
| 86 | //! |
| 87 | //! #### Modifiers |
| 88 | //! |
| 89 | //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the |
| 90 | //! behavior of the rules. |
| 91 | //! |
| 92 | //! 1. Silent (`_`) |
| 93 | //! |
| 94 | //! Silent rules do not create token pairs during parsing, nor are they error-reported. |
| 95 | //! |
| 96 | //! ```ignore |
| 97 | //! a = _{ "a" } |
| 98 | //! b = { a ~ "b" } |
| 99 | //! ``` |
| 100 | //! |
| 101 | //! Parsing `"ab"` produces the token pair `b()`. |
| 102 | //! |
| 103 | //! 2. Atomic (`@`) |
| 104 | //! |
| 105 | //! Atomic rules do not accept whitespace or comments within their expressions and have a |
| 106 | //! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic |
| 107 | //! rules behave atomically. |
| 108 | //! |
| 109 | //! Any rules called by atomic rules do not generate token pairs. |
| 110 | //! |
| 111 | //! ```ignore |
| 112 | //! a = { "a" } |
| 113 | //! b = @{ a ~ "b" } |
| 114 | //! |
| 115 | //! WHITESPACE = _{ " " } |
| 116 | //! ``` |
| 117 | //! |
| 118 | //! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error. |
| 119 | //! |
| 120 | //! 3. Compound-atomic (`$`) |
| 121 | //! |
| 122 | //! Compound-atomic rules are identical to atomic rules with the exception that rules called by |
| 123 | //! them are not forbidden from generating token pairs. |
| 124 | //! |
| 125 | //! ```ignore |
| 126 | //! a = { "a" } |
| 127 | //! b = ${ a ~ "b" } |
| 128 | //! |
| 129 | //! WHITESPACE = _{ " " } |
| 130 | //! ``` |
| 131 | //! |
| 132 | //! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error. |
| 133 | //! |
| 134 | //! 4. Non-atomic (`!`) |
| 135 | //! |
| 136 | //! Non-atomic rules are identical to normal rules with the exception that they stop the cascading |
| 137 | //! effect of atomic and compound-atomic rules. |
| 138 | //! |
| 139 | //! ```ignore |
| 140 | //! a = { "a" } |
| 141 | //! b = !{ a ~ "b" } |
| 142 | //! c = @{ b } |
| 143 | //! |
| 144 | //! WHITESPACE = _{ " " } |
| 145 | //! ``` |
| 146 | //! |
| 147 | //! Parsing either `"ab"` or `"a b"` produces the token pairs `c(a())`. |
| 148 | //! |
| 149 | //! #### Expressions |
| 150 | //! |
| 151 | //! Expressions can be either terminals or non-terminals. |
| 152 | //! |
| 153 | //! 1. Terminals |
| 154 | //! |
| 155 | //! | Terminal | Usage | |
| 156 | //! |------------|----------------------------------------------------------------| |
| 157 | //! | `"a"` | matches the exact string `"a"` | |
| 158 | //! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) | |
| 159 | //! | `'a'..'z'` | matches one character between `'a'` and `'z'` | |
| 160 | //! | `a` | matches rule `a` | |
| 161 | //! |
| 162 | //! Strings and characters follow |
| 163 | //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while |
| 164 | //! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not |
| 165 | //! start with a digit. |
| 166 | //! |
| 167 | //! 2. Non-terminals |
| 168 | //! |
| 169 | //! | Non-terminal | Usage | |
| 170 | //! |-----------------------|------------------------------------------------------------| |
| 171 | //! | `(e)` | matches `e` | |
| 172 | //! | `e1 ~ e2` | matches the sequence `e1` `e2` | |
| 173 | //! | <code>e1 \| e2</code> | matches either `e1` or `e2` | |
| 174 | //! | `e*` | matches `e` zero or more times | |
| 175 | //! | `e+` | matches `e` one or more times | |
| 176 | //! | `e{n}` | matches `e` exactly `n` times | |
| 177 | //! | `e{, n}` | matches `e` at most `n` times | |
| 178 | //! | `e{n,}` | matches `e` at least `n` times | |
| 179 | //! | `e{m, n}` | matches `e` between `m` and `n` times inclusively | |
| 180 | //! | `e?` | optionally matches `e` | |
| 181 | //! | `&e` | matches `e` without making progress | |
| 182 | //! | `!e` | matches if `e` doesn't match without making progress | |
| 183 | //! | `PUSH(e)` | matches `e` and pushes its captured string down the stack | |
| 184 | //! |
| 185 | //! where `e`, `e1`, and `e2` are expressions. |
| 186 | //! |
| 187 | //! Matching is greedy, without backtracking. Note the difference in behavior for |
| 188 | //! these two rules in matching identifiers that don't end in an underscore: |
| 189 | //! |
| 190 | //! ```ignore |
| 191 | //! // input: ab_bb_b |
| 192 | //! |
| 193 | //! identifier = @{ "a" ~ ("b" | "_")* ~ "b" } |
| 194 | //! // matches: "a", then "b_bb_b" (all consumed by the repetition), then nothing -> error! |
| 195 | //! |
| 196 | //! identifier = @{ "a" ~ ("_"* ~ "b")* } |
| 197 | //! // matches: "a", then "b", "_b", "b", "_b" in four repetitions |
| 198 | //! ``` |
| 199 | //! |
| 200 | //! Expressions can modify the stack only if they match the input. For example, |
| 201 | //! if `e1` in the compound expression `e1 | e2` does not match the input, then |
| 202 | //! it does not modify the stack, so `e2` sees the stack in the same state as |
| 203 | //! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`, |
| 204 | //! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e` |
| 205 | //! expressions are a special case; they never modify the stack. |
| 206 | //! |
| 207 | //! Many languages have "keyword" tokens (e.g. if, for, while) as well as general tokens (e.g. |
| 208 | //! identifier) that match any word. To match a keyword, you generally need to ensure that it is not |
| 209 | //! immediately followed by another letter or digit; otherwise it would be matched as an identifier. |
| 210 | //! |
| 211 | //! ## Special rules |
| 212 | //! |
| 213 | //! Special rules can be called within the grammar. They are: |
| 214 | //! |
| 215 | //! * `WHITESPACE` - runs between rules and sub-rules |
| 216 | //! * `COMMENT` - runs between rules and sub-rules |
| 217 | //! * `ANY` - matches exactly one `char` |
| 218 | //! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position |
| 219 | //! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end |
| 220 | //! * `POP` - pops a string from the stack and matches it |
| 221 | //! * `POP_ALL` - pops the entire state of the stack and matches it |
| 222 | //! * `PEEK` - peeks a string from the stack and matches it |
| 223 | //! * `PEEK[a..b]` - peeks part of the stack and matches it |
| 224 | //! * `PEEK_ALL` - peeks the entire state of the stack and matches it |
| 225 | //! * `DROP` - drops the top of the stack (fails to match if the stack is empty) |
| 226 | //! |
| 227 | //! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be |
| 228 | //! overridden. |
| 229 | //! |
| 230 | //! ## `WHITESPACE` and `COMMENT` |
| 231 | //! |
| 232 | //! When defined, these rules get matched automatically in sequences (`~`) and repetitions |
| 233 | //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt |
| 234 | //! from this behavior. |
| 235 | //! |
| 236 | //! These rules should be defined so as to match one whitespace character and one comment only since |
| 237 | //! they are run in repetitions. |
| 238 | //! |
| 239 | //! If both `WHITESPACE` and `COMMENT` are defined, this grammar: |
| 240 | //! |
| 241 | //! ```ignore |
| 242 | //! a = { b ~ c } |
| 243 | //! ``` |
| 244 | //! |
| 245 | //! is effectively transformed into this one behind the scenes: |
| 246 | //! |
| 247 | //! ```ignore |
| 248 | //! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c } |
| 249 | //! ``` |
| 250 | //! |
| 251 | //! ## `PUSH`, `POP`, `DROP`, and `PEEK` |
| 252 | //! |
| 253 | //! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can |
| 254 | //! then later be used to match grammar based on its content with `POP` and `PEEK`. |
| 255 | //! |
| 256 | //! `PEEK` always matches the string at the top of stack. So, if the stack contains `["b", "a"]` |
| 257 | //! (`"a"` being on top), this grammar: |
| 258 | //! |
| 259 | //! ```ignore |
| 260 | //! a = { PEEK } |
| 261 | //! ``` |
| 262 | //! |
| 263 | //! is effectively transformed into this one at parse time: |
| 264 | //! |
| 265 | //! ```ignore |
| 266 | //! a = { "a" } |
| 267 | //! ``` |
| 268 | //! |
| 269 | //! `POP` works the same way with the exception that it pops the string off of the stack if the |
| 270 | //! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated |
| 271 | //! to `["b"]`. |
| 272 | //! |
| 273 | //! `DROP` makes it possible to remove the string at the top of the stack |
| 274 | //! without matching it. If the stack is nonempty, `DROP` drops the top of the |
| 275 | //! stack. If the stack is empty, then `DROP` fails to match. |
| 276 | //! |
| 277 | //! ### Advanced peeking |
| 278 | //! |
| 279 | //! `PEEK[start..end]` and `PEEK_ALL` allow to peek deeper into the stack. The syntax works exactly |
| 280 | //! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an |
| 281 | //! offset from the top. If the end lies before or at the start, the expression matches (as does |
| 282 | //! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top): |
| 283 | //! |
| 284 | //! ```ignore |
| 285 | //! fill = PUSH("c" ) ~ PUSH("b" ) ~ PUSH("a" ) |
| 286 | //! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" } // top to bottom |
| 287 | //! w = { PEEK[..] } = { "c" ~ "b" ~ "a" } // bottom to top |
| 288 | //! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" } |
| 289 | //! y = { PEEK[..-2] } = { PEEK[0..1] } = { "c" } |
| 290 | //! z = { PEEK[1..] } = { PEEK[-2..3] } = { "b" ~ "a" } |
| 291 | //! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" } |
| 292 | //! ``` |
| 293 | //! |
| 294 | //! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches |
| 295 | //! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom. |
| 296 | //! |
| 297 | //! ## `Rule` |
| 298 | //! |
| 299 | //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This |
| 300 | //! implements `pest`'s `RuleType` and can be used throughout the API. |
| 301 | //! |
| 302 | //! ## Built-in rules |
| 303 | //! |
| 304 | //! Pest also comes with a number of built-in rules for convenience. They are: |
| 305 | //! |
| 306 | //! * `ASCII_DIGIT` - matches a numeric character from 0..9 |
| 307 | //! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9 |
| 308 | //! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1 |
| 309 | //! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7 |
| 310 | //! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F |
| 311 | //! * `ASCII_ALPHA_LOWER` - matches a character from a..z |
| 312 | //! * `ASCII_ALPHA_UPPER` - matches a character from A..Z |
| 313 | //! * `ASCII_ALPHA` - matches a character from a..z or A..Z |
| 314 | //! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9 |
| 315 | //! * `ASCII` - matches a character from \x00..\x7f |
| 316 | //! * `NEWLINE` - matches either "\n" or "\r\n" or "\r" |
| 317 | |
| 318 | use proc_macro::TokenStream; |
| 319 | |
| 320 | /// The main method that's called by the proc macro |
| 321 | /// (a wrapper around `pest_generator::derive_parser`) |
| 322 | #[proc_macro_derive (Parser, attributes(grammar, grammar_inline))] |
| 323 | pub fn derive_parser(input: TokenStream) -> TokenStream { |
| 324 | pest_generator::derive_parser(input.into(), include_grammar:true).into() |
| 325 | } |
| 326 | |