lib.rs source code [crates/pest_derive/src/lib.rs]

1	// pest. The Elegant Parser
2	// Copyright (c) 2018 Dragoș Tiselice
3	//
4	// Licensed under the Apache License, Version 2.0
5	// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6	// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7	// option. All files in the project carrying such notice may not be copied,
8	// modified, or distributed except according to those terms.
9	#![doc(
10	html_root_url = "https://docs.rs/pest_derive",
11	html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg",
12	html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg"
13	)]
14	#![warn(missing_docs, rust_2018_idioms, unused_qualifications)]
15	//! # pest. The Elegant Parser
16	//!
17	//! pest is a general purpose parser written in Rust with a focus on accessibility, correctness,
18	//! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in
19	//! spirit to regular expressions, but which offer the enhanced expressivity needed to parse
20	//! complex languages.
21	//!
22	//! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar
23	//!
24	//! ## Getting started
25	//!
26	//! The recommended way to start parsing with pest is to read the official [book].
27	//!
28	//! Other helpful resources:
29	//!
30	//! * API reference on [docs.rs]
31	//! * play with grammars and share them on our [fiddle]
32	//! * find previous common questions answered or ask questions on [GitHub Discussions]
33	//! * leave feedback, ask questions, or greet us on [Gitter] or [Discord]
34	//!
35	//! [book]: https://pest.rs/book
36	//! [docs.rs]: https://docs.rs/pest
37	//! [fiddle]: https://pest.rs/#editor
38	//! [Gitter]: https://gitter.im/pest-parser/pest
39	//! [Discord]: https://discord.gg/XEGACtWpT2
40	//! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions
41	//!
42	//!
43	//! ## `.pest` files
44	//!
45	//! Grammar definitions reside in custom `.pest` files located in the `src` directory. Their path is
46	//! relative to `src` and is specified between the `derive` attribute and empty `struct` that
47	//! `Parser` will be derived on.
48	//!
49	//! ```ignore
50	//! #[derive(Parser)]
51	//! #[grammar = "path/to/my_grammar.pest"] // relative to src
52	//! struct MyParser;
53	//! ```
54	//!
55	//! ## Inline grammars
56	//!
57	//! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute.
58	//!
59	//! ## Grammar
60	//!
61	//! A grammar is a series of rules separated by whitespace, possibly containing comments.
62	//!
63	//! ### Comments
64	//!
65	//! Comments start with `//` and end at the end of the line.
66	//!
67	//! ```text
68	//! // a comment
69	//! ```
70	//!
71	//! ### Rules
72	//!
73	//! Rules have the following form:
74	//!
75	//! ```ignore
76	//! name = optional_modifier { expression }
77	//! ```
78	//!
79	//! The name of the rule is formed from alphanumeric characters or `_` with the condition that the
80	//! first character is not a digit and is used to create token pairs. When the rule starts being
81	//! parsed, the starting part of the token is being produced, with the ending part being produced
82	//! when the rule finishes parsing.
83	//!
84	//! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end
85	//! `b`, start `c`, end `c`, end `a`.
86	//!
87	//! #### Modifiers
88	//!
89	//! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the
90	//! behavior of the rules.
91	//!
92	//! 1. Silent (`_`)
93	//!
94	//! Silent rules do not create token pairs during parsing, nor are they error-reported.
95	//!
96	//! ```ignore
97	//! a = _{ "a" }
98	//! b = { a ~ "b" }
99	//! ```
100	//!
101	//! Parsing `"ab"` produces the token pair `b()`.
102	//!
103	//! 2. Atomic (`@`)
104	//!
105	//! Atomic rules do not accept whitespace or comments within their expressions and have a
106	//! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic
107	//! rules behave atomically.
108	//!
109	//! Any rules called by atomic rules do not generate token pairs.
110	//!
111	//! ```ignore
112	//! a = { "a" }
113	//! b = @{ a ~ "b" }
114	//!
115	//! WHITESPACE = _{ " " }
116	//! ```
117	//!
118	//! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error.
119	//!
120	//! 3. Compound-atomic (`$`)
121	//!
122	//! Compound-atomic are identical to atomic rules with the exception that rules called by them are
123	//! not forbidden from generating token pairs.
124	//!
125	//! ```ignore
126	//! a = { "a" }
127	//! b = ${ a ~ "b" }
128	//!
129	//! WHITESPACE = _{ " " }
130	//! ```
131	//!
132	//! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error.
133	//!
134	//! 4. Non-atomic (`!`)
135	//!
136	//! Non-atomic are identical to normal rules with the exception that they stop the cascading effect
137	//! of atomic and compound-atomic rules.
138	//!
139	//! ```ignore
140	//! a = { "a" }
141	//! b = !{ a ~ "b" }
142	//! c = @{ b }
143	//!
144	//! WHITESPACE = _{ " " }
145	//! ```
146	//!
147	//! Parsing both `"ab"` and `"a b"` produce the token pairs `c(a())`.
148	//!
149	//! #### Expressions
150	//!
151	//! Expressions can be either terminals or non-terminals.
152	//!
153	//! 1. Terminals
154	//!
155	//! | Terminal | Usage |
156	//! |------------|----------------------------------------------------------------|
157	//! | `"a"` | matches the exact string `"a"` |
158	//! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) |
159	//! | `'a'..'z'` | matches one character between `'a'` and `'z'` |
160	//! | `a` | matches rule `a` |
161	//!
162	//! Strings and characters follow
163	//! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while
164	//! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not
165	//! start with a digit.
166	//!
167	//! 2. Non-terminals
168	//!
169	//! | Non-terminal | Usage |
170	//! |-----------------------|------------------------------------------------------------|
171	//! | `(e)` | matches `e` |
172	//! | `e1 ~ e2` | matches the sequence `e1` `e2` |
173	//! | <code>e1 \| e2</code> | matches either `e1` or `e2` |
174	//! | `e*` | matches `e` zero or more times |
175	//! | `e+` | matches `e` one or more times |
176	//! | `e{n}` | matches `e` exactly `n` times |
177	//! | `e{, n}` | matches `e` at most `n` times |
178	//! | `e{n,}` | matches `e` at least `n` times |
179	//! | `e{m, n}` | matches `e` between `m` and `n` times inclusively |
180	//! | `e?` | optionally matches `e` |
181	//! | `&e` | matches `e` without making progress |
182	//! | `!e` | matches if `e` doesn't match without making progress |
183	//! | `PUSH(e)` | matches `e` and pushes its captured string down the stack |
184	//!
185	//! where `e`, `e1`, and `e2` are expressions.
186	//!
187	//! Matching is greedy, without backtracking. Note the difference in behavior for
188	//! these two rules in matching identifiers that don't end in an underscore:
189	//!
190	//! ```ignore
191	//! // input: ab_bb_b
192	//!
193	//! identifier = @{ "a" ~ ("b"|"_")* ~ "b" }
194	//! // matches: a b_bb_b nothing -> error!
195	//!
196	//! identifier = @{ "a" ~ ("_"* ~ "b")* }
197	//! // matches: a b, _bb, _b in three repetitions
198	//! ```
199	//!
200	//! Expressions can modify the stack only if they match the input. For example,
201	//! if `e1` in the compound expression `e1 | e2` does not match the input, then
202	//! it does not modify the stack, so `e2` sees the stack in the same state as
203	//! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`,
204	//! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e`
205	//! expressions are a special case; they never modify the stack.
206	//! Many languages have "keyword" tokens (e.g. if, for, while) as well as general
207	//! tokens (e.g. identifier) that match any word. In order to match a keyword,
208	//! generally, you may need to ensure that it is not immediately followed by another
209	//! letter or digit (otherwise it would be matched as an identifier).
210	//!
211	//! ## Special rules
212	//!
213	//! Special rules can be called within the grammar. They are:
214	//!
215	//! * `WHITESPACE` - runs between rules and sub-rules
216	//! * `COMMENT` - runs between rules and sub-rules
217	//! * `ANY` - matches exactly one `char`
218	//! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position
219	//! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end
220	//! * `POP` - pops a string from the stack and matches it
221	//! * `POP_ALL` - pops the entire state of the stack and matches it
222	//! * `PEEK` - peeks a string from the stack and matches it
223	//! * `PEEK[a..b]` - peeks part of the stack and matches it
224	//! * `PEEK_ALL` - peeks the entire state of the stack and matches it
225	//! * `DROP` - drops the top of the stack (fails to match if the stack is empty)
226	//!
227	//! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be
228	//! overridden.
229	//!
230	//! ## `WHITESPACE` and `COMMENT`
231	//!
232	//! When defined, these rules get matched automatically in sequences (`~`) and repetitions
233	//! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt
234	//! from this behavior.
235	//!
236	//! These rules should be defined so as to match one whitespace character and one comment only since
237	//! they are run in repetitions.
238	//!
239	//! If both `WHITESPACE` and `COMMENT` are defined, this grammar:
240	//!
241	//! ```ignore
242	//! a = { b ~ c }
243	//! ```
244	//!
245	//! is effectively transformed into this one behind the scenes:
246	//!
247	//! ```ignore
248	//! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c }
249	//! ```
250	//!
251	//! ## `PUSH`, `POP`, `DROP`, and `PEEK`
252	//!
253	//! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can
254	//! then later be used to match grammar based on its content with `POP` and `PEEK`.
255	//!
256	//! `PEEK` always matches the string at the top of stack. So, if the stack contains `["b", "a"]`
257	//! (`"a"` being on top), this grammar:
258	//!
259	//! ```ignore
260	//! a = { PEEK }
261	//! ```
262	//!
263	//! is effectively transformed into at parse time:
264	//!
265	//! ```ignore
266	//! a = { "a" }
267	//! ```
268	//!
269	//! `POP` works the same way with the exception that it pops the string off of the stack if the
270	//! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated
271	//! to `["b"]`.
272	//!
273	//! `DROP` makes it possible to remove the string at the top of the stack
274	//! without matching it. If the stack is nonempty, `DROP` drops the top of the
275	//! stack. If the stack is empty, then `DROP` fails to match.
276	//!
277	//! ### Advanced peeking
278	//!
279	//! `PEEK[start..end]` and `PEEK_ALL` allow to peek deeper into the stack. The syntax works exactly
280	//! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an
281	//! offset from the top. If the end lies before or at the start, the expression matches (as does
282	//! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top):
283	//!
284	//! ```ignore
285	//! fill = PUSH("c") ~ PUSH("b") ~ PUSH("a")
286	//! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" } // top to bottom
287	//! w = { PEEK[..] } = { "c" ~ "b" ~ "a" } // bottom to top
288	//! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" }
289	//! y = { PEEK[..-2] } = { PEEK[0..1] } = { "a" }
290	//! z = { PEEK[1..] } = { PEEK[-2..3] } = { "c" ~ "b" }
291	//! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" }
292	//! ```
293	//!
294	//! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches
295	//! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom.
296	//!
297	//! ## `Rule`
298	//!
299	//! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This
300	//! implements `pest`'s `RuleType` and can be used throughout the API.
301	//!
302	//! ## `Built-in rules`
303	//!
304	//! Pest also comes with a number of built-in rules for convenience. They are:
305	//!
306	//! * `ASCII_DIGIT` - matches a numeric character from 0..9
307	//! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9
308	//! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1
309	//! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7
310	//! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F
311	//! * `ASCII_ALPHA_LOWER` - matches a character from a..z
312	//! * `ASCII_ALPHA_UPPER` - matches a character from A..Z
313	//! * `ASCII_ALPHA` - matches a character from a..z or A..Z
314	//! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9
315	//! * `ASCII` - matches a character from \x00..\x7f
316	//! * `NEWLINE` - matches either "\n" or "\r\n" or "\r"
317
318	use proc_macro::TokenStream;
319
320	/// The main method that's called by the proc macro
321	/// (a wrapper around `pest_generator::derive_parser`)
322	#[proc_macro_derive(Parser, attributes(grammar, grammar_inline))]
323	pub fn derive_parser(input: TokenStream) -> TokenStream {
324	pest_generator::derive_parser(input.into(), include_grammar:`true`).into()
325	}
326