1 | // pest. The Elegant Parser |
2 | // Copyright (c) 2018 Dragoș Tiselice |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 |
5 | // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT |
6 | // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. All files in the project carrying such notice may not be copied, |
8 | // modified, or distributed except according to those terms. |
9 | #![no_std ] |
10 | #![doc ( |
11 | html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg" , |
12 | html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg" |
13 | )] |
14 | #![warn (missing_docs, rust_2018_idioms, unused_qualifications)] |
15 | //! # pest. The Elegant Parser |
16 | //! |
17 | //! pest is a general purpose parser written in Rust with a focus on accessibility, correctness, |
18 | //! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in |
19 | //! spirit to regular expressions, but which offer the enhanced expressivity needed to parse |
20 | //! complex languages. |
21 | //! |
22 | //! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar |
23 | //! |
24 | //! ## Getting started |
25 | //! |
26 | //! The recommended way to start parsing with pest is to read the official [book]. |
27 | //! |
28 | //! Other helpful resources: |
29 | //! |
30 | //! * API reference on [docs.rs] |
31 | //! * play with grammars and share them on our [fiddle] |
32 | //! * find previous common questions answered or ask questions on [GitHub Discussions] |
33 | //! * leave feedback, ask questions, or greet us on [Gitter] or [Discord] |
34 | //! |
35 | //! [book]: https://pest.rs/book |
36 | //! [docs.rs]: https://docs.rs/pest |
37 | //! [fiddle]: https://pest.rs/#editor |
38 | //! [Gitter]: https://gitter.im/pest-parser/pest |
39 | //! [Discord]: https://discord.gg/XEGACtWpT2 |
40 | //! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions |
41 | //! |
42 | //! ## Usage |
43 | //! |
44 | //! The core of pest is the trait [`Parser`], which provides an interface to the parsing |
45 | //! functionality. |
46 | //! |
47 | //! The accompanying crate `pest_derive` can automatically generate a [`Parser`] from a PEG |
48 | //! grammar. Using `pest_derive` is highly encouraged, but it is also possible to implement |
49 | //! [`Parser`] manually if required. |
50 | //! |
51 | //! ## `.pest` files |
52 | //! |
53 | //! Grammar definitions reside in custom `.pest` files located in the crate `src` directory. |
54 | //! Parsers are automatically generated from these files using `#[derive(Parser)]` and a special |
55 | //! `#[grammar = "..."]` attribute on a dummy struct. |
56 | //! |
57 | //! ```ignore |
58 | //! #[derive(Parser)] |
59 | //! #[grammar = "path/to/my_grammar.pest" ] // relative to src |
60 | //! struct MyParser; |
61 | //! ``` |
62 | //! |
63 | //! The syntax of `.pest` files is documented in the [`pest_derive` crate]. |
64 | //! |
65 | //! ## Inline grammars |
66 | //! |
67 | //! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute. |
68 | //! |
69 | //! [`Parser`]: trait.Parser.html |
70 | //! [`pest_derive` crate]: https://docs.rs/pest_derive/ |
71 | //! |
72 | //! ## Grammar |
73 | //! |
74 | //! A grammar is a series of rules separated by whitespace, possibly containing comments. |
75 | //! |
76 | //! ### Comments |
77 | //! |
78 | //! Comments start with `//` and end at the end of the line. |
79 | //! |
80 | //! ```text |
81 | //! // a comment |
82 | //! ``` |
83 | //! |
84 | //! ### Rules |
85 | //! |
86 | //! Rules have the following form: |
87 | //! |
88 | //! ```ignore |
89 | //! name = optional_modifier { expression } |
90 | //! ``` |
91 | //! |
//! The name of the rule is formed from alphanumeric characters or `_`, with the condition that
//! the first character is not a digit; the name is used to create token pairs. When the rule
//! starts being parsed, the starting part of the token is produced, with the ending part being
//! produced when the rule finishes parsing.
96 | //! |
97 | //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end |
98 | //! `b`, start `c`, end `c`, end `a`. |
99 | //! |
100 | //! #### Modifiers |
101 | //! |
102 | //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the |
103 | //! behavior of the rules. |
104 | //! |
105 | //! 1. Silent (`_`) |
106 | //! |
107 | //! Silent rules do not create token pairs during parsing, nor are they error-reported. |
108 | //! |
109 | //! ```ignore |
110 | //! a = _{ "a" } |
111 | //! b = { a ~ "b" } |
112 | //! ``` |
113 | //! |
114 | //! Parsing `"ab"` produces the token pair `b()`. |
115 | //! |
116 | //! 2. Atomic (`@`) |
117 | //! |
118 | //! Atomic rules do not accept whitespace or comments within their expressions and have a |
119 | //! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic |
120 | //! rules behave atomically. |
121 | //! |
122 | //! Any rules called by atomic rules do not generate token pairs. |
123 | //! |
124 | //! ```ignore |
125 | //! a = { "a" } |
126 | //! b = @{ a ~ "b" } |
127 | //! |
128 | //! WHITESPACE = _{ " " } |
129 | //! ``` |
130 | //! |
131 | //! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error. |
132 | //! |
133 | //! 3. Compound-atomic (`$`) |
134 | //! |
135 | //! Compound-atomic are identical to atomic rules with the exception that rules called by them are |
136 | //! not forbidden from generating token pairs. |
137 | //! |
138 | //! ```ignore |
139 | //! a = { "a" } |
140 | //! b = ${ a ~ "b" } |
141 | //! |
142 | //! WHITESPACE = _{ " " } |
143 | //! ``` |
144 | //! |
145 | //! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error. |
146 | //! |
147 | //! 4. Non-atomic (`!`) |
148 | //! |
149 | //! Non-atomic are identical to normal rules with the exception that they stop the cascading effect |
150 | //! of atomic and compound-atomic rules. |
151 | //! |
152 | //! ```ignore |
153 | //! a = { "a" } |
154 | //! b = !{ a ~ "b" } |
155 | //! c = @{ b } |
156 | //! |
157 | //! WHITESPACE = _{ " " } |
158 | //! ``` |
159 | //! |
160 | //! Parsing both `"ab"` and `"a b"` produce the token pairs `c(a())`. |
161 | //! |
162 | //! #### Expressions |
163 | //! |
164 | //! Expressions can be either terminals or non-terminals. |
165 | //! |
166 | //! 1. Terminals |
167 | //! |
168 | //! | Terminal | Usage | |
169 | //! |------------|----------------------------------------------------------------| |
170 | //! | `"a"` | matches the exact string `"a"` | |
171 | //! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) | |
172 | //! | `'a'..'z'` | matches one character between `'a'` and `'z'` | |
173 | //! | `a` | matches rule `a` | |
174 | //! |
175 | //! Strings and characters follow |
176 | //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while |
177 | //! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not |
178 | //! start with a digit. |
179 | //! |
180 | //! 2. Non-terminals |
181 | //! |
182 | //! | Non-terminal | Usage | |
183 | //! |-----------------------|------------------------------------------------------------| |
184 | //! | `(e)` | matches `e` | |
185 | //! | `e1 ~ e2` | matches the sequence `e1` `e2` | |
186 | //! | <code>e1 \| e2</code> | matches either `e1` or `e2` | |
187 | //! | `e*` | matches `e` zero or more times | |
188 | //! | `e+` | matches `e` one or more times | |
189 | //! | `e{n}` | matches `e` exactly `n` times | |
190 | //! | `e{, n}` | matches `e` at most `n` times | |
191 | //! | `e{n,}` | matches `e` at least `n` times | |
192 | //! | `e{m, n}` | matches `e` between `m` and `n` times inclusively | |
193 | //! | `e?` | optionally matches `e` | |
194 | //! | `&e` | matches `e` without making progress | |
195 | //! | `!e` | matches if `e` doesn't match without making progress | |
//! | `PUSH(e)`             | matches `e` and pushes its captured string down the stack  |
197 | //! |
198 | //! where `e`, `e1`, and `e2` are expressions. |
199 | //! |
200 | //! Matching is greedy, without backtracking. Note the difference in behavior for |
201 | //! these two rules in matching identifiers that don't end in an underscore: |
202 | //! |
203 | //! ```ignore |
204 | //! // input: ab_bb_b |
205 | //! |
206 | //! identifier = @{ "a" ~ ("b" |"_" )* ~ "b" } |
207 | //! // matches: a b_bb_b nothing -> error! |
208 | //! |
209 | //! identifier = @{ "a" ~ ("_" * ~ "b" )* } |
210 | //! // matches: a b, _bb, _b in three repetitions |
211 | //! ``` |
212 | //! |
213 | //! Expressions can modify the stack only if they match the input. For example, |
214 | //! if `e1` in the compound expression `e1 | e2` does not match the input, then |
215 | //! it does not modify the stack, so `e2` sees the stack in the same state as |
216 | //! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`, |
217 | //! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e` |
218 | //! expressions are a special case; they never modify the stack. |
//! Many languages have "keyword" tokens (e.g. if, for, while) as well as general
//! tokens (e.g. identifier) that match any word. In order to match a keyword,
//! you generally need to restrict it so that it is not immediately followed by another
//! letter or digit (otherwise it would be matched as an identifier).
223 | //! |
224 | //! ## Special rules |
225 | //! |
226 | //! Special rules can be called within the grammar. They are: |
227 | //! |
228 | //! * `WHITESPACE` - runs between rules and sub-rules |
229 | //! * `COMMENT` - runs between rules and sub-rules |
230 | //! * `ANY` - matches exactly one `char` |
231 | //! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position |
232 | //! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end |
233 | //! * `POP` - pops a string from the stack and matches it |
234 | //! * `POP_ALL` - pops the entire state of the stack and matches it |
235 | //! * `PEEK` - peeks a string from the stack and matches it |
236 | //! * `PEEK[a..b]` - peeks part of the stack and matches it |
237 | //! * `PEEK_ALL` - peeks the entire state of the stack and matches it |
238 | //! * `DROP` - drops the top of the stack (fails to match if the stack is empty) |
239 | //! |
240 | //! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be |
241 | //! overridden. |
242 | //! |
243 | //! ## `WHITESPACE` and `COMMENT` |
244 | //! |
245 | //! When defined, these rules get matched automatically in sequences (`~`) and repetitions |
246 | //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt |
247 | //! from this behavior. |
248 | //! |
249 | //! These rules should be defined so as to match one whitespace character and one comment only since |
250 | //! they are run in repetitions. |
251 | //! |
252 | //! If both `WHITESPACE` and `COMMENT` are defined, this grammar: |
253 | //! |
254 | //! ```ignore |
255 | //! a = { b ~ c } |
256 | //! ``` |
257 | //! |
258 | //! is effectively transformed into this one behind the scenes: |
259 | //! |
260 | //! ```ignore |
261 | //! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c } |
262 | //! ``` |
263 | //! |
264 | //! ## `PUSH`, `POP`, `DROP`, and `PEEK` |
265 | //! |
266 | //! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can |
267 | //! then later be used to match grammar based on its content with `POP` and `PEEK`. |
268 | //! |
269 | //! `PEEK` always matches the string at the top of stack. So, if the stack contains `["b", "a"]` |
270 | //! (`"a"` being on top), this grammar: |
271 | //! |
272 | //! ```ignore |
273 | //! a = { PEEK } |
274 | //! ``` |
275 | //! |
276 | //! is effectively transformed into at parse time: |
277 | //! |
278 | //! ```ignore |
279 | //! a = { "a" } |
280 | //! ``` |
281 | //! |
282 | //! `POP` works the same way with the exception that it pops the string off of the stack if the |
283 | //! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated |
284 | //! to `["b"]`. |
285 | //! |
286 | //! `DROP` makes it possible to remove the string at the top of the stack |
287 | //! without matching it. If the stack is nonempty, `DROP` drops the top of the |
288 | //! stack. If the stack is empty, then `DROP` fails to match. |
289 | //! |
290 | //! ### Advanced peeking |
291 | //! |
//! `PEEK[start..end]` and `PEEK_ALL` allow peeking deeper into the stack. The syntax works exactly
293 | //! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an |
294 | //! offset from the top. If the end lies before or at the start, the expression matches (as does |
295 | //! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top): |
296 | //! |
297 | //! ```ignore |
298 | //! fill = PUSH("c" ) ~ PUSH("b" ) ~ PUSH("a" ) |
299 | //! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" } // top to bottom |
300 | //! w = { PEEK[..] } = { "c" ~ "b" ~ "a" } // bottom to top |
301 | //! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" } |
302 | //! y = { PEEK[..-2] } = { PEEK[0..1] } = { "a" } |
303 | //! z = { PEEK[1..] } = { PEEK[-2..3] } = { "c" ~ "b" } |
304 | //! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" } |
305 | //! ``` |
306 | //! |
307 | //! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches |
308 | //! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom. |
309 | //! |
310 | //! ## `Rule` |
311 | //! |
312 | //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This |
313 | //! implements `pest`'s `RuleType` and can be used throughout the API. |
314 | //! |
315 | //! ## `Built-in rules` |
316 | //! |
317 | //! Pest also comes with a number of built-in rules for convenience. They are: |
318 | //! |
319 | //! * `ASCII_DIGIT` - matches a numeric character from 0..9 |
320 | //! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9 |
321 | //! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1 |
322 | //! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7 |
323 | //! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F |
324 | //! * `ASCII_ALPHA_LOWER` - matches a character from a..z |
325 | //! * `ASCII_ALPHA_UPPER` - matches a character from A..Z |
326 | //! * `ASCII_ALPHA` - matches a character from a..z or A..Z |
327 | //! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9 |
328 | //! * `ASCII` - matches a character from \x00..\x7f |
329 | //! * `NEWLINE` - matches either "\n" or "\r\n" or "\r" |
330 | |
331 | #![doc (html_root_url = "https://docs.rs/pest" )] |
332 | |
333 | extern crate alloc; |
334 | #[cfg (feature = "std" )] |
335 | extern crate std; |
336 | |
337 | pub use crate::parser::Parser; |
338 | pub use crate::parser_state::{ |
339 | set_call_limit, state, Atomicity, Lookahead, MatchDir, ParseResult, ParserState, |
340 | }; |
341 | pub use crate::position::Position; |
342 | pub use crate::span::{Lines, LinesSpan, Span}; |
343 | pub use crate::token::Token; |
344 | use core::fmt::Debug; |
345 | use core::hash::Hash; |
346 | |
// Parse-error types (public so callers can inspect and report failures).
pub mod error;
// Iterator types for traversing parse output.
pub mod iterators;
// Crate-internal helper macros.
mod macros;
// Defines the `Parser` trait, re-exported at the crate root.
mod parser;
// Core parsing machinery; `ParserState`, `state`, `Atomicity`, `Lookahead`,
// `MatchDir`, `ParseResult`, and `set_call_limit` are re-exported at the crate root.
mod parser_state;
// Defines `Position`, re-exported at the crate root.
mod position;
// Operator-precedence (Pratt) parsing utilities.
pub mod pratt_parser;
// Older precedence-climbing parser, superseded by `pratt_parser` (see note below).
#[deprecated(
    since = "2.4.0",
    note = "Use `pest::pratt_parser` instead (it is an equivalent which also supports unary prefix/suffix operators).
While prec_climber is going to be kept in 2.x minor and patch releases, it may be removed in a future major release."
)]
pub mod prec_climber;
// Defines `Span`, `Lines`, and `LinesSpan`, re-exported at the crate root.
mod span;
// Internal stack, presumably backing the grammar's `PUSH`/`POP`/`PEEK` operations
// described in the crate docs — confirm in module source.
mod stack;
// Defines `Token`, re-exported at the crate root.
mod token;

// Hidden from rustdoc but kept `pub`, presumably so generated parser code can
// reach it — confirm against `pest_derive` output.
#[doc(hidden)]
pub mod unicode;
366 | |
/// Trait bound for the rule type a parser produces.
///
/// Nothing needs to be implemented by hand: the blanket impl below covers every
/// type that already satisfies the listed supertraits, so deriving those traits
/// on a rule `enum` is sufficient.
///
/// This is essentially a [trait alias](https://github.com/rust-lang/rfcs/pull/1733);
/// once trait aliases are available in the language, this may be replaced by one.
pub trait RuleType: Copy + Debug + Eq + Hash + Ord {}

impl<T> RuleType for T where T: Copy + Debug + Eq + Hash + Ord {}
377 | |