1// pest. The Elegant Parser
2// Copyright (c) 2018 Dragoș Tiselice
3//
4// Licensed under the Apache License, Version 2.0
5// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. All files in the project carrying such notice may not be copied,
8// modified, or distributed except according to those terms.
9#![doc(
10 html_root_url = "https://docs.rs/pest_derive",
11 html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg",
12 html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg"
13)]
14#![warn(missing_docs, rust_2018_idioms, unused_qualifications)]
15//! # pest. The Elegant Parser
16//!
17//! pest is a general purpose parser written in Rust with a focus on accessibility, correctness,
18//! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in
19//! spirit to regular expressions, but which offer the enhanced expressivity needed to parse
20//! complex languages.
21//!
22//! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar
23//!
24//! ## Getting started
25//!
26//! The recommended way to start parsing with pest is to read the official [book].
27//!
28//! Other helpful resources:
29//!
30//! * API reference on [docs.rs]
31//! * play with grammars and share them on our [fiddle]
32//! * find previous common questions answered or ask questions on [GitHub Discussions]
33//! * leave feedback, ask questions, or greet us on [Gitter] or [Discord]
34//!
35//! [book]: https://pest.rs/book
36//! [docs.rs]: https://docs.rs/pest
37//! [fiddle]: https://pest.rs/#editor
38//! [Gitter]: https://gitter.im/pest-parser/pest
39//! [Discord]: https://discord.gg/XEGACtWpT2
40//! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions
41//!
42//!
43//! ## `.pest` files
44//!
45//! Grammar definitions reside in custom `.pest` files located in the `src` directory. Their path is
46//! relative to `src` and is specified between the `derive` attribute and empty `struct` that
47//! `Parser` will be derived on.
48//!
49//! ```ignore
50//! #[derive(Parser)]
51//! #[grammar = "path/to/my_grammar.pest"] // relative to src
52//! struct MyParser;
53//! ```
54//!
55//! ## Inline grammars
56//!
57//! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute.
58//!
59//! ## Grammar
60//!
61//! A grammar is a series of rules separated by whitespace, possibly containing comments.
62//!
63//! ### Comments
64//!
65//! Comments start with `//` and end at the end of the line.
66//!
67//! ```text
68//! // a comment
69//! ```
70//!
71//! ### Rules
72//!
73//! Rules have the following form:
74//!
75//! ```ignore
76//! name = optional_modifier { expression }
77//! ```
78//!
79//! The name of the rule is formed from alphanumeric characters or `_` with the condition that the
80//! first character is not a digit and is used to create token pairs. When the rule starts being
81//! parsed, the starting part of the token is being produced, with the ending part being produced
82//! when the rule finishes parsing.
83//!
84//! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end
85//! `b`, start `c`, end `c`, end `a`.
86//!
87//! #### Modifiers
88//!
89//! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the
90//! behavior of the rules.
91//!
92//! 1. Silent (`_`)
93//!
94//! Silent rules do not create token pairs during parsing, nor are they error-reported.
95//!
96//! ```ignore
97//! a = _{ "a" }
98//! b = { a ~ "b" }
99//! ```
100//!
101//! Parsing `"ab"` produces the token pair `b()`.
102//!
103//! 2. Atomic (`@`)
104//!
105//! Atomic rules do not accept whitespace or comments within their expressions and have a
106//! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic
107//! rules behave atomically.
108//!
109//! Any rules called by atomic rules do not generate token pairs.
110//!
111//! ```ignore
112//! a = { "a" }
113//! b = @{ a ~ "b" }
114//!
115//! WHITESPACE = _{ " " }
116//! ```
117//!
118//! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error.
119//!
120//! 3. Compound-atomic (`$`)
121//!
//! Compound-atomic rules are identical to atomic rules with the exception that rules called by
//! them are not forbidden from generating token pairs.
124//!
125//! ```ignore
126//! a = { "a" }
127//! b = ${ a ~ "b" }
128//!
129//! WHITESPACE = _{ " " }
130//! ```
131//!
132//! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error.
133//!
134//! 4. Non-atomic (`!`)
135//!
//! Non-atomic rules are identical to normal rules with the exception that they stop the cascading
//! effect of atomic and compound-atomic rules.
138//!
139//! ```ignore
140//! a = { "a" }
141//! b = !{ a ~ "b" }
142//! c = @{ b }
143//!
144//! WHITESPACE = _{ " " }
145//! ```
146//!
//! Parsing either `"ab"` or `"a b"` produces the token pairs `c(a())`.
148//!
149//! #### Expressions
150//!
151//! Expressions can be either terminals or non-terminals.
152//!
153//! 1. Terminals
154//!
155//! | Terminal | Usage |
156//! |------------|----------------------------------------------------------------|
157//! | `"a"` | matches the exact string `"a"` |
158//! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) |
159//! | `'a'..'z'` | matches one character between `'a'` and `'z'` |
160//! | `a` | matches rule `a` |
161//!
162//! Strings and characters follow
163//! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while
164//! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not
165//! start with a digit.
166//!
167//! 2. Non-terminals
168//!
169//! | Non-terminal | Usage |
170//! |-----------------------|------------------------------------------------------------|
171//! | `(e)` | matches `e` |
172//! | `e1 ~ e2` | matches the sequence `e1` `e2` |
173//! | <code>e1 \| e2</code> | matches either `e1` or `e2` |
174//! | `e*` | matches `e` zero or more times |
175//! | `e+` | matches `e` one or more times |
176//! | `e{n}` | matches `e` exactly `n` times |
177//! | `e{, n}` | matches `e` at most `n` times |
178//! | `e{n,}` | matches `e` at least `n` times |
179//! | `e{m, n}` | matches `e` between `m` and `n` times inclusively |
180//! | `e?` | optionally matches `e` |
181//! | `&e` | matches `e` without making progress |
182//! | `!e` | matches if `e` doesn't match without making progress |
//! | `PUSH(e)` | matches `e` and pushes its captured string down the stack |
184//!
185//! where `e`, `e1`, and `e2` are expressions.
186//!
187//! Matching is greedy, without backtracking. Note the difference in behavior for
188//! these two rules in matching identifiers that don't end in an underscore:
189//!
190//! ```ignore
191//! // input: ab_bb_b
192//!
193//! identifier = @{ "a" ~ ("b"|"_")* ~ "b" }
194//! // matches: a b_bb_b nothing -> error!
195//!
196//! identifier = @{ "a" ~ ("_"* ~ "b")* }
197//! // matches: a b, _bb, _b in three repetitions
198//! ```
199//!
200//! Expressions can modify the stack only if they match the input. For example,
201//! if `e1` in the compound expression `e1 | e2` does not match the input, then
202//! it does not modify the stack, so `e2` sees the stack in the same state as
203//! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`,
204//! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e`
205//! expressions are a special case; they never modify the stack.
//!
//! Many languages have "keyword" tokens (e.g. if, for, while) as well as general
//! tokens (e.g. identifier) that match any word. In order to match a keyword,
//! you generally need to require that it is not immediately followed by another
//! letter or digit (otherwise it would be matched as an identifier).
210//!
211//! ## Special rules
212//!
213//! Special rules can be called within the grammar. They are:
214//!
215//! * `WHITESPACE` - runs between rules and sub-rules
216//! * `COMMENT` - runs between rules and sub-rules
217//! * `ANY` - matches exactly one `char`
218//! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position
219//! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end
220//! * `POP` - pops a string from the stack and matches it
221//! * `POP_ALL` - pops the entire state of the stack and matches it
222//! * `PEEK` - peeks a string from the stack and matches it
223//! * `PEEK[a..b]` - peeks part of the stack and matches it
224//! * `PEEK_ALL` - peeks the entire state of the stack and matches it
225//! * `DROP` - drops the top of the stack (fails to match if the stack is empty)
226//!
227//! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be
228//! overridden.
229//!
230//! ## `WHITESPACE` and `COMMENT`
231//!
232//! When defined, these rules get matched automatically in sequences (`~`) and repetitions
233//! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt
234//! from this behavior.
235//!
236//! These rules should be defined so as to match one whitespace character and one comment only since
237//! they are run in repetitions.
238//!
239//! If both `WHITESPACE` and `COMMENT` are defined, this grammar:
240//!
241//! ```ignore
242//! a = { b ~ c }
243//! ```
244//!
245//! is effectively transformed into this one behind the scenes:
246//!
247//! ```ignore
248//! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c }
249//! ```
250//!
251//! ## `PUSH`, `POP`, `DROP`, and `PEEK`
252//!
253//! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can
254//! then later be used to match grammar based on its content with `POP` and `PEEK`.
255//!
256//! `PEEK` always matches the string at the top of stack. So, if the stack contains `["b", "a"]`
257//! (`"a"` being on top), this grammar:
258//!
259//! ```ignore
260//! a = { PEEK }
261//! ```
262//!
//! is effectively transformed at parse time into:
264//!
265//! ```ignore
266//! a = { "a" }
267//! ```
268//!
269//! `POP` works the same way with the exception that it pops the string off of the stack if the
270//! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated
271//! to `["b"]`.
272//!
273//! `DROP` makes it possible to remove the string at the top of the stack
274//! without matching it. If the stack is nonempty, `DROP` drops the top of the
275//! stack. If the stack is empty, then `DROP` fails to match.
276//!
277//! ### Advanced peeking
278//!
//! `PEEK[start..end]` and `PEEK_ALL` allow peeking deeper into the stack. The syntax works exactly
280//! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an
281//! offset from the top. If the end lies before or at the start, the expression matches (as does
282//! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top):
283//!
284//! ```ignore
285//! fill = PUSH("c") ~ PUSH("b") ~ PUSH("a")
286//! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" } // top to bottom
287//! w = { PEEK[..] } = { "c" ~ "b" ~ "a" } // bottom to top
288//! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" }
//! y = { PEEK[..-2] } = { PEEK[0..1] } = { "c" }
//! z = { PEEK[1..] } = { PEEK[-2..3] } = { "b" ~ "a" }
291//! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" }
292//! ```
293//!
294//! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches
295//! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom.
296//!
297//! ## `Rule`
298//!
299//! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This
300//! implements `pest`'s `RuleType` and can be used throughout the API.
301//!
302//! ## `Built-in rules`
303//!
304//! Pest also comes with a number of built-in rules for convenience. They are:
305//!
306//! * `ASCII_DIGIT` - matches a numeric character from 0..9
307//! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9
308//! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1
309//! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7
310//! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F
311//! * `ASCII_ALPHA_LOWER` - matches a character from a..z
312//! * `ASCII_ALPHA_UPPER` - matches a character from A..Z
313//! * `ASCII_ALPHA` - matches a character from a..z or A..Z
314//! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9
315//! * `ASCII` - matches a character from \x00..\x7f
316//! * `NEWLINE` - matches either "\n" or "\r\n" or "\r"
317
318use proc_macro::TokenStream;
319
320/// The main method that's called by the proc macro
321/// (a wrapper around `pest_generator::derive_parser`)
322#[proc_macro_derive(Parser, attributes(grammar, grammar_inline))]
323pub fn derive_parser(input: TokenStream) -> TokenStream {
324 pest_generator::derive_parser(input:input.into(), include_grammar:true).into()
325}
326