1 | // pest. The Elegant Parser |
2 | // Copyright (c) 2018 Dragoș Tiselice |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 |
5 | // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT |
6 | // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. All files in the project carrying such notice may not be copied, |
8 | // modified, or distributed except according to those terms. |
9 | #![no_std ] |
10 | #![doc ( |
11 | html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg" , |
12 | html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg" |
13 | )] |
14 | #![warn (missing_docs, rust_2018_idioms, unused_qualifications)] |
15 | //! # pest. The Elegant Parser |
16 | //! |
17 | //! pest is a general purpose parser written in Rust with a focus on accessibility, correctness, |
18 | //! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in |
19 | //! spirit to regular expressions, but which offer the enhanced expressivity needed to parse |
20 | //! complex languages. |
21 | //! |
22 | //! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar |
23 | //! |
24 | //! ## Getting started |
25 | //! |
26 | //! The recommended way to start parsing with pest is to read the official [book]. |
27 | //! |
28 | //! Other helpful resources: |
29 | //! |
30 | //! * API reference on [docs.rs] |
31 | //! * play with grammars and share them on our [fiddle] |
32 | //! * find previous common questions answered or ask questions on [GitHub Discussions] |
33 | //! * leave feedback, ask questions, or greet us on [Gitter] or [Discord] |
34 | //! |
35 | //! [book]: https://pest.rs/book |
36 | //! [docs.rs]: https://docs.rs/pest |
37 | //! [fiddle]: https://pest.rs/#editor |
38 | //! [Gitter]: https://gitter.im/pest-parser/pest |
39 | //! [Discord]: https://discord.gg/XEGACtWpT2 |
40 | //! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions |
41 | //! |
42 | //! ## Usage |
43 | //! |
44 | //! The core of pest is the trait [`Parser`], which provides an interface to the parsing |
45 | //! functionality. |
46 | //! |
47 | //! The accompanying crate `pest_derive` can automatically generate a [`Parser`] from a PEG |
48 | //! grammar. Using `pest_derive` is highly encouraged, but it is also possible to implement |
49 | //! [`Parser`] manually if required. |
50 | //! |
51 | //! ## `.pest` files |
52 | //! |
53 | //! Grammar definitions reside in custom `.pest` files located in the crate `src` directory. |
54 | //! Parsers are automatically generated from these files using `#[derive(Parser)]` and a special |
55 | //! `#[grammar = "..."]` attribute on a dummy struct. |
56 | //! |
57 | //! ```ignore |
58 | //! #[derive(Parser)] |
59 | //! #[grammar = "path/to/my_grammar.pest" ] // relative to src |
60 | //! struct MyParser; |
61 | //! ``` |
62 | //! |
63 | //! The syntax of `.pest` files is documented in the [`pest_derive` crate]. |
64 | //! |
65 | //! ## Inline grammars |
66 | //! |
67 | //! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute. |
68 | //! |
69 | //! [`Parser`]: trait.Parser.html |
70 | //! [`pest_derive` crate]: https://docs.rs/pest_derive/ |
71 | //! |
72 | //! ## Grammar |
73 | //! |
74 | //! A grammar is a series of rules separated by whitespace, possibly containing comments. |
75 | //! |
76 | //! ### Comments |
77 | //! |
78 | //! Comments start with `//` and end at the end of the line. |
79 | //! |
80 | //! ```text |
81 | //! // a comment |
82 | //! ``` |
83 | //! |
84 | //! ### Rules |
85 | //! |
86 | //! Rules have the following form: |
87 | //! |
88 | //! ```ignore |
89 | //! name = optional_modifier { expression } |
90 | //! ``` |
91 | //! |
//! The name of the rule is formed from alphanumeric characters or `_`, with the condition that
//! the first character is not a digit; the name is used to create token pairs. When the rule
//! starts being parsed, the starting part of the token is produced, with the ending part being
//! produced when the rule finishes parsing.
96 | //! |
97 | //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end |
98 | //! `b`, start `c`, end `c`, end `a`. |
99 | //! |
100 | //! #### Modifiers |
101 | //! |
102 | //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the |
103 | //! behavior of the rules. |
104 | //! |
105 | //! 1. Silent (`_`) |
106 | //! |
107 | //! Silent rules do not create token pairs during parsing, nor are they error-reported. |
108 | //! |
109 | //! ```ignore |
110 | //! a = _{ "a" } |
111 | //! b = { a ~ "b" } |
112 | //! ``` |
113 | //! |
114 | //! Parsing `"ab"` produces the token pair `b()`. |
115 | //! |
116 | //! 2. Atomic (`@`) |
117 | //! |
118 | //! Atomic rules do not accept whitespace or comments within their expressions and have a |
119 | //! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic |
120 | //! rules behave atomically. |
121 | //! |
122 | //! Any rules called by atomic rules do not generate token pairs. |
123 | //! |
124 | //! ```ignore |
125 | //! a = { "a" } |
126 | //! b = @{ a ~ "b" } |
127 | //! |
128 | //! WHITESPACE = _{ " " } |
129 | //! ``` |
130 | //! |
131 | //! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error. |
132 | //! |
133 | //! 3. Compound-atomic (`$`) |
134 | //! |
135 | //! Compound-atomic are identical to atomic rules with the exception that rules called by them are |
136 | //! not forbidden from generating token pairs. |
137 | //! |
138 | //! ```ignore |
139 | //! a = { "a" } |
140 | //! b = ${ a ~ "b" } |
141 | //! |
142 | //! WHITESPACE = _{ " " } |
143 | //! ``` |
144 | //! |
145 | //! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error. |
146 | //! |
147 | //! 4. Non-atomic (`!`) |
148 | //! |
149 | //! Non-atomic are identical to normal rules with the exception that they stop the cascading effect |
150 | //! of atomic and compound-atomic rules. |
151 | //! |
152 | //! ```ignore |
153 | //! a = { "a" } |
154 | //! b = !{ a ~ "b" } |
155 | //! c = @{ b } |
156 | //! |
157 | //! WHITESPACE = _{ " " } |
158 | //! ``` |
159 | //! |
160 | //! Parsing both `"ab"` and `"a b"` produce the token pairs `c(a())`. |
161 | //! |
162 | //! #### Expressions |
163 | //! |
164 | //! Expressions can be either terminals or non-terminals. |
165 | //! |
166 | //! 1. Terminals |
167 | //! |
168 | //! | Terminal | Usage | |
169 | //! |------------|----------------------------------------------------------------| |
170 | //! | `"a"` | matches the exact string `"a"` | |
171 | //! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) | |
172 | //! | `'a'..'z'` | matches one character between `'a'` and `'z'` | |
173 | //! | `a` | matches rule `a` | |
174 | //! |
175 | //! Strings and characters follow |
176 | //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while |
177 | //! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not |
178 | //! start with a digit. |
179 | //! |
180 | //! 2. Non-terminals |
181 | //! |
182 | //! | Non-terminal | Usage | |
183 | //! |-----------------------|------------------------------------------------------------| |
184 | //! | `(e)` | matches `e` | |
185 | //! | `e1 ~ e2` | matches the sequence `e1` `e2` | |
186 | //! | <code>e1 \| e2</code> | matches either `e1` or `e2` | |
187 | //! | `e*` | matches `e` zero or more times | |
188 | //! | `e+` | matches `e` one or more times | |
189 | //! | `e{n}` | matches `e` exactly `n` times | |
190 | //! | `e{, n}` | matches `e` at most `n` times | |
191 | //! | `e{n,}` | matches `e` at least `n` times | |
192 | //! | `e{m, n}` | matches `e` between `m` and `n` times inclusively | |
193 | //! | `e?` | optionally matches `e` | |
194 | //! | `&e` | matches `e` without making progress | |
195 | //! | `!e` | matches if `e` doesn't match without making progress | |
//! | `PUSH(e)`             | matches `e` and pushes its captured string down the stack  |
197 | //! |
198 | //! where `e`, `e1`, and `e2` are expressions. |
199 | //! |
200 | //! Matching is greedy, without backtracking. Note the difference in behavior for |
201 | //! these two rules in matching identifiers that don't end in an underscore: |
202 | //! |
203 | //! ```ignore |
204 | //! // input: ab_bb_b |
205 | //! |
206 | //! identifier = @{ "a" ~ ("b" |"_" )* ~ "b" } |
207 | //! // matches: a b_bb_b nothing -> error! |
208 | //! |
209 | //! identifier = @{ "a" ~ ("_" * ~ "b" )* } |
210 | //! // matches: a b, _bb, _b in three repetitions |
211 | //! ``` |
212 | //! |
213 | //! Expressions can modify the stack only if they match the input. For example, |
214 | //! if `e1` in the compound expression `e1 | e2` does not match the input, then |
215 | //! it does not modify the stack, so `e2` sees the stack in the same state as |
216 | //! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`, |
217 | //! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e` |
218 | //! expressions are a special case; they never modify the stack. |
//! Many languages have "keyword" tokens (e.g. if, for, while) as well as general
//! tokens (e.g. identifier) that match any word. In order to match a keyword,
//! you generally need to restrict it so that it is not immediately followed by another
//! letter or digit (otherwise it would be matched as an identifier).
223 | //! |
224 | //! ## Special rules |
225 | //! |
226 | //! Special rules can be called within the grammar. They are: |
227 | //! |
228 | //! * `WHITESPACE` - runs between rules and sub-rules |
229 | //! * `COMMENT` - runs between rules and sub-rules |
230 | //! * `ANY` - matches exactly one `char` |
231 | //! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position |
232 | //! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end |
233 | //! * `POP` - pops a string from the stack and matches it |
234 | //! * `POP_ALL` - pops the entire state of the stack and matches it |
235 | //! * `PEEK` - peeks a string from the stack and matches it |
236 | //! * `PEEK[a..b]` - peeks part of the stack and matches it |
237 | //! * `PEEK_ALL` - peeks the entire state of the stack and matches it |
238 | //! * `DROP` - drops the top of the stack (fails to match if the stack is empty) |
239 | //! |
240 | //! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be |
241 | //! overridden. |
242 | //! |
243 | //! ## `WHITESPACE` and `COMMENT` |
244 | //! |
245 | //! When defined, these rules get matched automatically in sequences (`~`) and repetitions |
246 | //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt |
247 | //! from this behavior. |
248 | //! |
249 | //! These rules should be defined so as to match one whitespace character and one comment only since |
250 | //! they are run in repetitions. |
251 | //! |
252 | //! If both `WHITESPACE` and `COMMENT` are defined, this grammar: |
253 | //! |
254 | //! ```ignore |
255 | //! a = { b ~ c } |
256 | //! ``` |
257 | //! |
258 | //! is effectively transformed into this one behind the scenes: |
259 | //! |
260 | //! ```ignore |
261 | //! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c } |
262 | //! ``` |
263 | //! |
264 | //! ## `PUSH`, `POP`, `DROP`, and `PEEK` |
265 | //! |
266 | //! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can |
267 | //! then later be used to match grammar based on its content with `POP` and `PEEK`. |
268 | //! |
269 | //! `PEEK` always matches the string at the top of stack. So, if the stack contains `["b", "a"]` |
270 | //! (`"a"` being on top), this grammar: |
271 | //! |
272 | //! ```ignore |
273 | //! a = { PEEK } |
274 | //! ``` |
275 | //! |
276 | //! is effectively transformed into at parse time: |
277 | //! |
278 | //! ```ignore |
279 | //! a = { "a" } |
280 | //! ``` |
281 | //! |
282 | //! `POP` works the same way with the exception that it pops the string off of the stack if the |
283 | //! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated |
284 | //! to `["b"]`. |
285 | //! |
286 | //! `DROP` makes it possible to remove the string at the top of the stack |
287 | //! without matching it. If the stack is nonempty, `DROP` drops the top of the |
288 | //! stack. If the stack is empty, then `DROP` fails to match. |
289 | //! |
290 | //! ### Advanced peeking |
291 | //! |
//! `PEEK[start..end]` and `PEEK_ALL` allow peeking deeper into the stack. The syntax works exactly
293 | //! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an |
294 | //! offset from the top. If the end lies before or at the start, the expression matches (as does |
295 | //! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top): |
296 | //! |
297 | //! ```ignore |
298 | //! fill = PUSH("c" ) ~ PUSH("b" ) ~ PUSH("a" ) |
299 | //! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" } // top to bottom |
300 | //! w = { PEEK[..] } = { "c" ~ "b" ~ "a" } // bottom to top |
301 | //! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" } |
302 | //! y = { PEEK[..-2] } = { PEEK[0..1] } = { "a" } |
303 | //! z = { PEEK[1..] } = { PEEK[-2..3] } = { "c" ~ "b" } |
304 | //! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" } |
305 | //! ``` |
306 | //! |
307 | //! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches |
308 | //! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom. |
309 | //! |
310 | //! ## `Rule` |
311 | //! |
312 | //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This |
313 | //! implements `pest`'s `RuleType` and can be used throughout the API. |
314 | //! |
315 | //! ## `Built-in rules` |
316 | //! |
317 | //! Pest also comes with a number of built-in rules for convenience. They are: |
318 | //! |
319 | //! * `ASCII_DIGIT` - matches a numeric character from 0..9 |
320 | //! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9 |
321 | //! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1 |
322 | //! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7 |
323 | //! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F |
324 | //! * `ASCII_ALPHA_LOWER` - matches a character from a..z |
325 | //! * `ASCII_ALPHA_UPPER` - matches a character from A..Z |
326 | //! * `ASCII_ALPHA` - matches a character from a..z or A..Z |
327 | //! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9 |
328 | //! * `ASCII` - matches a character from \x00..\x7f |
329 | //! * `NEWLINE` - matches either "\n" or "\r\n" or "\r" |
330 | |
331 | #![doc (html_root_url = "https://docs.rs/pest" )] |
332 | |
333 | extern crate alloc; |
334 | #[cfg (feature = "std" )] |
335 | extern crate std; |
336 | |
337 | pub use crate::parser::Parser; |
338 | pub use crate::parser_state::{ |
339 | set_call_limit, state, Atomicity, Lookahead, MatchDir, ParseResult, ParserState, |
340 | }; |
341 | pub use crate::position::Position; |
342 | pub use crate::span::{Lines, LinesSpan, Span}; |
343 | pub use crate::token::Token; |
344 | use core::fmt::Debug; |
345 | use core::hash::Hash; |
346 | |
// Parse-error types (public so callers can inspect and report failures).
pub mod error;
// Iterator types for traversing parse output.
pub mod iterators;
// Crate-internal helper macros.
mod macros;
// Defines the `Parser` trait, re-exported at the crate root.
mod parser;
// Core parsing machinery; `ParserState`, `state`, `Atomicity`, `Lookahead`,
// `MatchDir`, `ParseResult`, and `set_call_limit` are re-exported at the crate root.
mod parser_state;
// Defines `Position`, re-exported at the crate root.
mod position;
// Operator-precedence (Pratt) parsing utilities.
pub mod pratt_parser;
// Older precedence-climbing parser, superseded by `pratt_parser` (see note below).
#[deprecated(
    since = "2.4.0",
    note = "Use `pest::pratt_parser` instead (it is an equivalent which also supports unary prefix/suffix operators).
While prec_climber is going to be kept in 2.x minor and patch releases, it may be removed in a future major release."
)]
pub mod prec_climber;
// Defines `Span`, `Lines`, and `LinesSpan`, re-exported at the crate root.
mod span;
// Internal stack, presumably backing the grammar's `PUSH`/`POP`/`PEEK` operations
// described in the crate docs — confirm in module source.
mod stack;
// Defines `Token`, re-exported at the crate root.
mod token;

// Hidden from rustdoc but kept `pub`, presumably so generated parser code can
// reach it — confirm against `pest_derive` output.
#[doc(hidden)]
pub mod unicode;
366 | |
/// Trait bound for the rule type a parser produces.
///
/// Nothing needs to be implemented by hand: the blanket impl below covers every
/// type that already satisfies the listed supertraits, so deriving those traits
/// on a rule `enum` is sufficient.
///
/// This is essentially a [trait alias](https://github.com/rust-lang/rfcs/pull/1733);
/// once trait aliases are available in the language, this may be replaced by one.
pub trait RuleType: Copy + Debug + Eq + Hash + Ord {}

impl<T> RuleType for T where T: Copy + Debug + Eq + Hash + Ord {}
377 | |