syntax.rs source code [crates/regex-automata-0.4.9/src/util/syntax.rs]

/*!
Utilities for dealing with the syntax of a regular expression.

This module currently only exposes a [`Config`] type that
itself represents a wrapper around the configuration for a
[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of
this wrapper is to make configuring syntax options very similar to how other
configuration is done throughout this crate. Namely, instead of duplicating
syntax options across every builder (of which there are many), we instead
create small config objects like this one that can be passed around and
composed.
*/
13
14	use alloc::{vec, vec::Vec};
15
16	use regex_syntax::{
17	ast,
18	hir::{self, Hir},
19	Error, ParserBuilder,
20	};
21
22	/// A convenience routine for parsing a pattern into an HIR value with the
23	/// default configuration.
24	///
25	/// # Example
26	///
27	/// This shows how to parse a pattern into an HIR value:
28	///
29	/// ```
30	/// use regex_automata::util::syntax;
31	///
32	/// let hir = syntax::parse(r"([a-z]+)\|([0-9]+)")?;
33	/// assert_eq!(Some(`1`), hir.properties().static_explicit_captures_len());
34	///
35	/// # Ok::<(), Box<dyn std::error::Error>>(())
36	/// ```
37	pub fn parse(pattern: &str) -> Result<Hir, Error> {
38	parse_with(pattern, &Config::default())
39	}
40
41	/// A convenience routine for parsing many patterns into HIR value with the
42	/// default configuration.
43	///
44	/// # Example
45	///
46	/// This shows how to parse many patterns into an corresponding HIR values:
47	///
48	/// ```
49	/// use {
50	/// regex_automata::util::syntax,
51	/// regex_syntax::hir::Properties,
52	/// };
53	///
54	/// let hirs = syntax::parse_many(&[
55	/// r"([a-z]+)\|([0-9]+)",
56	/// r"foo(A-Z]+)bar",
57	/// ])?;
58	/// let props = Properties::union(hirs.iter().map(\|h\| h.properties()));
59	/// assert_eq!(Some(`1`), props.static_explicit_captures_len());
60	///
61	/// # Ok::<(), Box<dyn std::error::Error>>(())
62	/// ```
63	pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
64	parse_many_with(patterns, &Config::default())
65	}
66
67	/// A convenience routine for parsing a pattern into an HIR value using a
68	/// `Config`.
69	///
70	/// # Example
71	///
72	/// This shows how to parse a pattern into an HIR value with a non-default
73	/// configuration:
74	///
75	/// ```
76	/// use regex_automata::util::syntax;
77	///
78	/// let hir = syntax::parse_with(
79	/// r"^[a-z]+$",
80	/// &syntax::Config::new().multi_line(`true`).crlf(`true`),
81	/// )?;
82	/// assert!(hir.properties().look_set().contains_anchor_crlf());
83	///
84	/// # Ok::<(), Box<dyn std::error::Error>>(())
85	/// ```
86	pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
87	let mut builder: ParserBuilder = ParserBuilder::new();
88	config.apply(&mut builder);
89	builder.build().parse(pattern)
90	}
91
92	/// A convenience routine for parsing many patterns into HIR values using a
93	/// `Config`.
94	///
95	/// # Example
96	///
97	/// This shows how to parse many patterns into an corresponding HIR values
98	/// with a non-default configuration:
99	///
100	/// ```
101	/// use {
102	/// regex_automata::util::syntax,
103	/// regex_syntax::hir::Properties,
104	/// };
105	///
106	/// let patterns = &[
107	/// r"([a-z]+)\|([0-9]+)",
108	/// r"\W",
109	/// r"foo(A-Z]+)bar",
110	/// ];
111	/// let config = syntax::Config::new().unicode(`false`).utf8(`false`);
112	/// let hirs = syntax::parse_many_with(patterns, &config)?;
113	/// let props = Properties::union(hirs.iter().map(\|h\| h.properties()));
114	/// assert!(!props.is_utf8());
115	///
116	/// # Ok::<(), Box<dyn std::error::Error>>(())
117	/// ```
118	pub fn parse_many_with<P: AsRef<str>>(
119	patterns: &[P],
120	config: &Config,
121	) -> Result<Vec<Hir>, Error> {
122	let mut builder: ParserBuilder = ParserBuilder::new();
123	config.apply(&mut builder);
124	let mut hirs: Vec = vec![];
125	for p: &P in patterns.iter() {
126	hirs.push(builder.build().parse(pattern:p.as_ref())?);
127	}
128	Ok(hirs)
129	}
130
/// A common set of configuration options that apply to the syntax of a regex.
///
/// This represents a group of configuration options that specifically apply
/// to how the concrete syntax of a regular expression is interpreted. In
/// particular, they are generally forwarded to the
/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
/// in the
/// [`regex-syntax`](https://docs.rs/regex-syntax)
/// crate when building a regex from its concrete syntax directly.
///
/// These options are defined as a group since they apply to every regex engine
/// in this crate. Instead of re-defining them on every engine's builder, they
/// are instead provided here as one cohesive unit.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    // Each field is written by the builder-style method of the same name and
    // read back by the corresponding `get_*` accessor.
    case_insensitive: bool,
    multi_line: bool,
    dot_matches_new_line: bool,
    crlf: bool,
    line_terminator: u8,
    swap_greed: bool,
    ignore_whitespace: bool,
    unicode: bool,
    utf8: bool,
    nest_limit: u32,
    octal: bool,
}
158
159	impl Config {
160	/// Return a new default syntax configuration.
161	pub fn new() -> Config {
162	// These defaults match the ones used in regex-syntax.
163	Config {
164	case_insensitive: `false`,
165	multi_line: `false`,
166	dot_matches_new_line: `false`,
167	crlf: `false`,
168	line_terminator: b'`\n`',
169	swap_greed: `false`,
170	ignore_whitespace: `false`,
171	unicode: `true`,
172	utf8: `true`,
173	nest_limit: `250`,
174	octal: `false`,
175	}
176	}
177
178	/// Enable or disable the case insensitive flag by default.
179	///
180	/// When Unicode mode is enabled, case insensitivity is Unicode-aware.
181	/// Specifically, it will apply the "simple" case folding rules as
182	/// specified by Unicode.
183	///
184	/// By default this is disabled. It may alternatively be selectively
185	/// enabled in the regular expression itself via the `i` flag.
186	pub fn case_insensitive(mut self, yes: bool) -> Config {
187	self.case_insensitive = yes;
188	self
189	}
190
191	/// Enable or disable the multi-line matching flag by default.
192	///
193	/// When this is enabled, the `^` and `$` look-around assertions will
194	/// match immediately after and immediately before a new line character,
195	/// respectively. Note that the `\A` and `\z` look-around assertions are
196	/// unaffected by this setting and always correspond to matching at the
197	/// beginning and end of the input.
198	///
199	/// By default this is disabled. It may alternatively be selectively
200	/// enabled in the regular expression itself via the `m` flag.
201	pub fn multi_line(mut self, yes: bool) -> Config {
202	self.multi_line = yes;
203	self
204	}
205
206	/// Enable or disable the "dot matches any character" flag by default.
207	///
208	/// When this is enabled, `.` will match any character. When it's disabled,
209	/// then `.` will match any character except for a new line character.
210	///
211	/// Note that `.` is impacted by whether the "unicode" setting is enabled
212	/// or not. When Unicode is enabled (the default), `.` will match any UTF-8
213	/// encoding of any Unicode scalar value (sans a new line, depending on
214	/// whether this "dot matches new line" option is enabled). When Unicode
215	/// mode is disabled, `.` will match any byte instead. Because of this,
216	/// when Unicode mode is disabled, `.` can only be used when the "allow
217	/// invalid UTF-8" option is enabled, since `.` could otherwise match
218	/// invalid UTF-8.
219	///
220	/// By default this is disabled. It may alternatively be selectively
221	/// enabled in the regular expression itself via the `s` flag.
222	pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
223	self.dot_matches_new_line = yes;
224	self
225	}
226
227	/// Enable or disable the "CRLF mode" flag by default.
228	///
229	/// By default this is disabled. It may alternatively be selectively
230	/// enabled in the regular expression itself via the `R` flag.
231	///
232	/// When CRLF mode is enabled, the following happens:
233	///
234	/// Unless `dot_matches_new_line` is enabled, `.` will match any character*
235	/// except for `\r` and `\n`.
236	/// When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,*
237	/// `\r` and `\n` as line terminators. And in particular, neither will
238	/// match between a `\r` and a `\n`.
239	pub fn crlf(mut self, yes: bool) -> Config {
240	self.crlf = yes;
241	self
242	}
243
244	/// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
245	///
246	/// Namely, instead of `.` (by default) matching everything except for `\n`,
247	/// this will cause `.` to match everything except for the byte given.
248	///
249	/// If `.` is used in a context where Unicode mode is enabled and this byte
250	/// isn't ASCII, then an error will be returned. When Unicode mode is
251	/// disabled, then any byte is permitted, but will return an error if UTF-8
252	/// mode is enabled and it is a non-ASCII byte.
253	///
254	/// In short, any ASCII value for a line terminator is always okay. But a
255	/// non-ASCII byte might result in an error depending on whether Unicode
256	/// mode or UTF-8 mode are enabled.
257	///
258	/// Note that if `R` mode is enabled then it always takes precedence and
259	/// the line terminator will be treated as `\r` and `\n` simultaneously.
260	///
261	/// Note also that this doesn't* impact the look-around assertions*
262	/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
263	/// configuration in the regex engine itself.
264	pub fn line_terminator(mut self, byte: u8) -> Config {
265	self.line_terminator = byte;
266	self
267	}
268
269	/// Enable or disable the "swap greed" flag by default.
270	///
271	/// When this is enabled, `.` (for example) will become ungreedy and `.?`
272	/// will become greedy.
273	///
274	/// By default this is disabled. It may alternatively be selectively
275	/// enabled in the regular expression itself via the `U` flag.
276	pub fn swap_greed(mut self, yes: bool) -> Config {
277	self.swap_greed = yes;
278	self
279	}
280
281	/// Enable verbose mode in the regular expression.
282	///
283	/// When enabled, verbose mode permits insigificant whitespace in many
284	/// places in the regular expression, as well as comments. Comments are
285	/// started using `#` and continue until the end of the line.
286	///
287	/// By default, this is disabled. It may be selectively enabled in the
288	/// regular expression by using the `x` flag regardless of this setting.
289	pub fn ignore_whitespace(mut self, yes: bool) -> Config {
290	self.ignore_whitespace = yes;
291	self
292	}
293
294	/// Enable or disable the Unicode flag (`u`) by default.
295	///
296	/// By default this is enabled. It may alternatively be selectively
297	/// disabled in the regular expression itself via the `u` flag.
298	///
299	/// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
300	/// default), a regular expression will fail to parse if Unicode mode is
301	/// disabled and a sub-expression could possibly match invalid UTF-8.
302	///
303	/// WARNING: Unicode mode can greatly increase the size of the compiled
304	/// DFA, which can noticeably impact both memory usage and compilation
305	/// time. This is especially noticeable if your regex contains character
306	/// classes like `\w` that are impacted by whether Unicode is enabled or
307	/// not. If Unicode is not necessary, you are encouraged to disable it.
308	pub fn unicode(mut self, yes: bool) -> Config {
309	self.unicode = yes;
310	self
311	}
312
313	/// When disabled, the builder will permit the construction of a regular
314	/// expression that may match invalid UTF-8.
315	///
316	/// For example, when [`Config::unicode`] is disabled, then
317	/// expressions like `[^a]` may match invalid UTF-8 since they can match
318	/// any single byte that is not `a`. By default, these sub-expressions
319	/// are disallowed to avoid returning offsets that split a UTF-8
320	/// encoded codepoint. However, in cases where matching at arbitrary
321	/// locations is desired, this option can be disabled to permit all such
322	/// sub-expressions.
323	///
324	/// When enabled (the default), the builder is guaranteed to produce a
325	/// regex that will only ever match valid UTF-8 (otherwise, the builder
326	/// will return an error).
327	pub fn utf8(mut self, yes: bool) -> Config {
328	self.utf8 = yes;
329	self
330	}
331
332	/// Set the nesting limit used for the regular expression parser.
333	///
334	/// The nesting limit controls how deep the abstract syntax tree is allowed
335	/// to be. If the AST exceeds the given limit (e.g., with too many nested
336	/// groups), then an error is returned by the parser.
337	///
338	/// The purpose of this limit is to act as a heuristic to prevent stack
339	/// overflow when building a finite automaton from a regular expression's
340	/// abstract syntax tree. In particular, construction currently uses
341	/// recursion. In the future, the implementation may stop using recursion
342	/// and this option will no longer be necessary.
343	///
344	/// This limit is not checked until the entire AST is parsed. Therefore,
345	/// if callers want to put a limit on the amount of heap space used, then
346	/// they should impose a limit on the length, in bytes, of the concrete
347	/// pattern string. In particular, this is viable since the parser will
348	/// limit itself to heap space proportional to the length of the pattern
349	/// string.
350	///
351	/// Note that a nest limit of `0` will return a nest limit error for most
352	/// patterns but not all. For example, a nest limit of `0` permits `a` but
353	/// not `ab`, since `ab` requires a concatenation AST item, which results
354	/// in a nest depth of `1`. In general, a nest limit is not something that
355	/// manifests in an obvious way in the concrete syntax, therefore, it
356	/// should not be used in a granular way.
357	pub fn nest_limit(mut self, limit: u32) -> Config {
358	self.nest_limit = limit;
359	self
360	}
361
362	/// Whether to support octal syntax or not.
363	///
364	/// Octal syntax is a little-known way of uttering Unicode codepoints in
365	/// a regular expression. For example, `a`, `\x61`, `\u0061` and
366	/// `\141` are all equivalent regular expressions, where the last example
367	/// shows octal syntax.
368	///
369	/// While supporting octal syntax isn't in and of itself a problem, it does
370	/// make good error messages harder. That is, in PCRE based regex engines,
371	/// syntax like `\1` invokes a backreference, which is explicitly
372	/// unsupported in Rust's regex engine. However, many users expect it to
373	/// be supported. Therefore, when octal support is disabled, the error
374	/// message will explicitly mention that backreferences aren't supported.
375	///
376	/// Octal syntax is disabled by default.
377	pub fn octal(mut self, yes: bool) -> Config {
378	self.octal = yes;
379	self
380	}
381
382	/// Returns whether "unicode" mode is enabled.
383	pub fn get_unicode(&self) -> bool {
384	self.unicode
385	}
386
387	/// Returns whether "case insensitive" mode is enabled.
388	pub fn get_case_insensitive(&self) -> bool {
389	self.case_insensitive
390	}
391
392	/// Returns whether "multi line" mode is enabled.
393	pub fn get_multi_line(&self) -> bool {
394	self.multi_line
395	}
396
397	/// Returns whether "dot matches new line" mode is enabled.
398	pub fn get_dot_matches_new_line(&self) -> bool {
399	self.dot_matches_new_line
400	}
401
402	/// Returns whether "CRLF" mode is enabled.
403	pub fn get_crlf(&self) -> bool {
404	self.crlf
405	}
406
407	/// Returns the line terminator in this syntax configuration.
408	pub fn get_line_terminator(&self) -> u8 {
409	self.line_terminator
410	}
411
412	/// Returns whether "swap greed" mode is enabled.
413	pub fn get_swap_greed(&self) -> bool {
414	self.swap_greed
415	}
416
417	/// Returns whether "ignore whitespace" mode is enabled.
418	pub fn get_ignore_whitespace(&self) -> bool {
419	self.ignore_whitespace
420	}
421
422	/// Returns whether UTF-8 mode is enabled.
423	pub fn get_utf8(&self) -> bool {
424	self.utf8
425	}
426
427	/// Returns the "nest limit" setting.
428	pub fn get_nest_limit(&self) -> u32 {
429	self.nest_limit
430	}
431
432	/// Returns whether "octal" mode is enabled.
433	pub fn get_octal(&self) -> bool {
434	self.octal
435	}
436
437	/// Applies this configuration to the given parser.
438	pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
439	builder
440	.unicode(self.unicode)
441	.case_insensitive(self.case_insensitive)
442	.multi_line(self.multi_line)
443	.dot_matches_new_line(self.dot_matches_new_line)
444	.crlf(self.crlf)
445	.line_terminator(self.line_terminator)
446	.swap_greed(self.swap_greed)
447	.ignore_whitespace(self.ignore_whitespace)
448	.utf8(self.utf8)
449	.nest_limit(self.nest_limit)
450	.octal(self.octal);
451	}
452
453	/// Applies this configuration to the given AST parser.
454	pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
455	builder
456	.ignore_whitespace(self.ignore_whitespace)
457	.nest_limit(self.nest_limit)
458	.octal(self.octal);
459	}
460
461	/// Applies this configuration to the given AST-to-HIR translator.
462	pub(crate) fn apply_hir(
463	&self,
464	builder: &mut hir::translate::TranslatorBuilder,
465	) {
466	builder
467	.unicode(self.unicode)
468	.case_insensitive(self.case_insensitive)
469	.multi_line(self.multi_line)
470	.crlf(self.crlf)
471	.dot_matches_new_line(self.dot_matches_new_line)
472	.line_terminator(self.line_terminator)
473	.swap_greed(self.swap_greed)
474	.utf8(self.utf8);
475	}
476	}
477
478	impl Default for Config {
479	fn default() -> Config {
480	Config::new()
481	}
482	}
483