parser.rs source code [crates/regex-syntax/src/parser.rs]

1	use crate::{ast, hir, Error};
2
3	/// A convenience routine for parsing a regex using default options.
4	///
5	/// This is equivalent to `Parser::new().parse(pattern)`.
6	///
7	/// If you need to set non-default options, then use a [`ParserBuilder`].
8	///
9	/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically
10	/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator
11	/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then
12	/// you should use a [`ast::parse::Parser`].
13	pub fn parse(pattern: &str) -> Result<hir::Hir, Error> {
14	Parser::new().parse(pattern)
15	}
16
17	/// A builder for a regular expression parser.
18	///
19	/// This builder permits modifying configuration options for the parser.
20	///
21	/// This type combines the builder options for both the [AST
22	/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR
23	/// `TranslatorBuilder`](hir::translate::TranslatorBuilder).
24	#[derive(Clone, Debug, Default)]
25	pub struct ParserBuilder {
26	ast: ast::parse::ParserBuilder,
27	hir: hir::translate::TranslatorBuilder,
28	}
29
30	impl ParserBuilder {
31	/// Create a new parser builder with a default configuration.
32	pub fn new() -> ParserBuilder {
33	ParserBuilder::default()
34	}
35
36	/// Build a parser from this configuration with the given pattern.
37	pub fn build(&self) -> Parser {
38	Parser { ast: self.ast.build(), hir: self.hir.build() }
39	}
40
41	/// Set the nesting limit for this parser.
42	///
43	/// The nesting limit controls how deep the abstract syntax tree is allowed
44	/// to be. If the AST exceeds the given limit (e.g., with too many nested
45	/// groups), then an error is returned by the parser.
46	///
47	/// The purpose of this limit is to act as a heuristic to prevent stack
48	/// overflow for consumers that do structural induction on an `Ast` using
49	/// explicit recursion. While this crate never does this (instead using
50	/// constant stack space and moving the call stack to the heap), other
51	/// crates may.
52	///
53	/// This limit is not checked until the entire Ast is parsed. Therefore,
54	/// if callers want to put a limit on the amount of heap space used, then
55	/// they should impose a limit on the length, in bytes, of the concrete
56	/// pattern string. In particular, this is viable since this parser
57	/// implementation will limit itself to heap space proportional to the
58	/// length of the pattern string.
59	///
60	/// Note that a nest limit of `0` will return a nest limit error for most
61	/// patterns but not all. For example, a nest limit of `0` permits `a` but
62	/// not `ab`, since `ab` requires a concatenation, which results in a nest
63	/// depth of `1`. In general, a nest limit is not something that manifests
64	/// in an obvious way in the concrete syntax, therefore, it should not be
65	/// used in a granular way.
66	pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
67	self.ast.nest_limit(limit);
68	self
69	}
70
71	/// Whether to support octal syntax or not.
72	///
73	/// Octal syntax is a little-known way of uttering Unicode codepoints in
74	/// a regular expression. For example, `a`, `\x61`, `\u0061` and
75	/// `\141` are all equivalent regular expressions, where the last example
76	/// shows octal syntax.
77	///
78	/// While supporting octal syntax isn't in and of itself a problem, it does
79	/// make good error messages harder. That is, in PCRE based regex engines,
80	/// syntax like `\0` invokes a backreference, which is explicitly
81	/// unsupported in Rust's regex engine. However, many users expect it to
82	/// be supported. Therefore, when octal support is disabled, the error
83	/// message will explicitly mention that backreferences aren't supported.
84	///
85	/// Octal syntax is disabled by default.
86	pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
87	self.ast.octal(yes);
88	self
89	}
90
91	/// When disabled, translation will permit the construction of a regular
92	/// expression that may match invalid UTF-8.
93	///
94	/// When enabled (the default), the translator is guaranteed to produce an
95	/// expression that, for non-empty matches, will only ever produce spans
96	/// that are entirely valid UTF-8 (otherwise, the translator will return an
97	/// error).
98	///
99	/// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
100	/// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
101	/// syntax) will be allowed even though they can produce matches that split
102	/// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
103	/// matches, and it is expected that the regex engine itself must handle
104	/// these cases if necessary (perhaps by suppressing any zero-width matches
105	/// that split a codepoint).
106	pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder {
107	self.hir.utf8(yes);
108	self
109	}
110
111	/// Enable verbose mode in the regular expression.
112	///
113	/// When enabled, verbose mode permits insignificant whitespace in many
114	/// places in the regular expression, as well as comments. Comments are
115	/// started using `#` and continue until the end of the line.
116	///
117	/// By default, this is disabled. It may be selectively enabled in the
118	/// regular expression by using the `x` flag regardless of this setting.
119	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
120	self.ast.ignore_whitespace(yes);
121	self
122	}
123
124	/// Enable or disable the case insensitive flag by default.
125	///
126	/// By default this is disabled. It may alternatively be selectively
127	/// enabled in the regular expression itself via the `i` flag.
128	pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
129	self.hir.case_insensitive(yes);
130	self
131	}
132
133	/// Enable or disable the multi-line matching flag by default.
134	///
135	/// By default this is disabled. It may alternatively be selectively
136	/// enabled in the regular expression itself via the `m` flag.
137	pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
138	self.hir.multi_line(yes);
139	self
140	}
141
142	/// Enable or disable the "dot matches any character" flag by default.
143	///
144	/// By default this is disabled. It may alternatively be selectively
145	/// enabled in the regular expression itself via the `s` flag.
146	pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
147	self.hir.dot_matches_new_line(yes);
148	self
149	}
150
151	/// Enable or disable the CRLF mode flag by default.
152	///
153	/// By default this is disabled. It may alternatively be selectively
154	/// enabled in the regular expression itself via the `R` flag.
155	///
156	/// When CRLF mode is enabled, the following happens:
157	///
158	/// Unless `dot_matches_new_line` is enabled, `.` will match any character*
159	/// except for `\r` and `\n`.
160	/// When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,*
161	/// `\r` and `\n` as line terminators. And in particular, neither will
162	/// match between a `\r` and a `\n`.
163	pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder {
164	self.hir.crlf(yes);
165	self
166	}
167
168	/// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
169	///
170	/// Namely, instead of `.` (by default) matching everything except for `\n`,
171	/// this will cause `.` to match everything except for the byte given.
172	///
173	/// If `.` is used in a context where Unicode mode is enabled and this byte
174	/// isn't ASCII, then an error will be returned. When Unicode mode is
175	/// disabled, then any byte is permitted, but will return an error if UTF-8
176	/// mode is enabled and it is a non-ASCII byte.
177	///
178	/// In short, any ASCII value for a line terminator is always okay. But a
179	/// non-ASCII byte might result in an error depending on whether Unicode
180	/// mode or UTF-8 mode are enabled.
181	///
182	/// Note that if `R` mode is enabled then it always takes precedence and
183	/// the line terminator will be treated as `\r` and `\n` simultaneously.
184	///
185	/// Note also that this doesn't* impact the look-around assertions*
186	/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
187	/// configuration in the regex engine itself.
188	pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder {
189	self.hir.line_terminator(byte);
190	self
191	}
192
193	/// Enable or disable the "swap greed" flag by default.
194	///
195	/// By default this is disabled. It may alternatively be selectively
196	/// enabled in the regular expression itself via the `U` flag.
197	pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
198	self.hir.swap_greed(yes);
199	self
200	}
201
202	/// Enable or disable the Unicode flag (`u`) by default.
203	///
204	/// By default this is enabled. It may alternatively be selectively
205	/// disabled in the regular expression itself via the `u` flag.
206	///
207	/// Note that unless `utf8` is disabled (it's enabled by default), a
208	/// regular expression will fail to parse if Unicode mode is disabled and a
209	/// sub-expression could possibly match invalid UTF-8.
210	pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
211	self.hir.unicode(yes);
212	self
213	}
214	}
215
216	/// A convenience parser for regular expressions.
217	///
218	/// This parser takes as input a regular expression pattern string (the
219	/// "concrete syntax") and returns a high-level intermediate representation
220	/// (the HIR) suitable for most types of analysis. In particular, this parser
221	/// hides the intermediate state of producing an AST (the "abstract syntax").
222	/// The AST is itself far more complex than the HIR, so this parser serves as a
223	/// convenience for never having to deal with it at all.
224	///
225	/// If callers have more fine grained use cases that need an AST, then please
226	/// see the [`ast::parse`] module.
227	///
228	/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
229	#[derive(Clone, Debug)]
230	pub struct Parser {
231	ast: ast::parse::Parser,
232	hir: hir::translate::Translator,
233	}
234
235	impl Parser {
236	/// Create a new parser with a default configuration.
237	///
238	/// The parser can be run with `parse` method. The parse method returns
239	/// a high level intermediate representation of the given regular
240	/// expression.
241	///
242	/// To set configuration options on the parser, use [`ParserBuilder`].
243	pub fn new() -> Parser {
244	ParserBuilder::new().build()
245	}
246
247	/// Parse the regular expression into a high level intermediate
248	/// representation.
249	pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> {
250	let ast: Ast = self.ast.parse(pattern)?;
251	let hir: Hir = self.hir.translate(pattern, &ast)?;
252	Ok(hir)
253	}
254	}
255