1use crate::ast;
2use crate::hir;
3
4use crate::Result;
5
6/// A builder for a regular expression parser.
7///
8/// This builder permits modifying configuration options for the parser.
9///
10/// This type combines the builder options for both the
11/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html)
12/// and the
13/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html).
14#[derive(Clone, Debug, Default)]
15pub struct ParserBuilder {
16 ast: ast::parse::ParserBuilder,
17 hir: hir::translate::TranslatorBuilder,
18}
19
20impl ParserBuilder {
21 /// Create a new parser builder with a default configuration.
22 pub fn new() -> ParserBuilder {
23 ParserBuilder::default()
24 }
25
26 /// Build a parser from this configuration with the given pattern.
27 pub fn build(&self) -> Parser {
28 Parser { ast: self.ast.build(), hir: self.hir.build() }
29 }
30
31 /// Set the nesting limit for this parser.
32 ///
33 /// The nesting limit controls how deep the abstract syntax tree is allowed
34 /// to be. If the AST exceeds the given limit (e.g., with too many nested
35 /// groups), then an error is returned by the parser.
36 ///
37 /// The purpose of this limit is to act as a heuristic to prevent stack
38 /// overflow for consumers that do structural induction on an `Ast` using
39 /// explicit recursion. While this crate never does this (instead using
40 /// constant stack space and moving the call stack to the heap), other
41 /// crates may.
42 ///
43 /// This limit is not checked until the entire Ast is parsed. Therefore,
44 /// if callers want to put a limit on the amount of heap space used, then
45 /// they should impose a limit on the length, in bytes, of the concrete
46 /// pattern string. In particular, this is viable since this parser
47 /// implementation will limit itself to heap space proportional to the
48 /// length of the pattern string.
49 ///
50 /// Note that a nest limit of `0` will return a nest limit error for most
51 /// patterns but not all. For example, a nest limit of `0` permits `a` but
52 /// not `ab`, since `ab` requires a concatenation, which results in a nest
53 /// depth of `1`. In general, a nest limit is not something that manifests
54 /// in an obvious way in the concrete syntax, therefore, it should not be
55 /// used in a granular way.
56 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
57 self.ast.nest_limit(limit);
58 self
59 }
60
61 /// Whether to support octal syntax or not.
62 ///
63 /// Octal syntax is a little-known way of uttering Unicode codepoints in
64 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
65 /// `\141` are all equivalent regular expressions, where the last example
66 /// shows octal syntax.
67 ///
68 /// While supporting octal syntax isn't in and of itself a problem, it does
69 /// make good error messages harder. That is, in PCRE based regex engines,
70 /// syntax like `\0` invokes a backreference, which is explicitly
71 /// unsupported in Rust's regex engine. However, many users expect it to
72 /// be supported. Therefore, when octal support is disabled, the error
73 /// message will explicitly mention that backreferences aren't supported.
74 ///
75 /// Octal syntax is disabled by default.
76 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
77 self.ast.octal(yes);
78 self
79 }
80
81 /// When enabled, the parser will permit the construction of a regular
82 /// expression that may match invalid UTF-8.
83 ///
84 /// When disabled (the default), the parser is guaranteed to produce
85 /// an expression that will only ever match valid UTF-8 (otherwise, the
86 /// parser will return an error).
87 ///
88 /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
89 /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
90 /// the parser to return an error. Namely, a negated ASCII word boundary
91 /// can result in matching positions that aren't valid UTF-8 boundaries.
92 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
93 self.hir.allow_invalid_utf8(yes);
94 self
95 }
96
97 /// Enable verbose mode in the regular expression.
98 ///
99 /// When enabled, verbose mode permits insignificant whitespace in many
100 /// places in the regular expression, as well as comments. Comments are
101 /// started using `#` and continue until the end of the line.
102 ///
103 /// By default, this is disabled. It may be selectively enabled in the
104 /// regular expression by using the `x` flag regardless of this setting.
105 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
106 self.ast.ignore_whitespace(yes);
107 self
108 }
109
110 /// Enable or disable the case insensitive flag by default.
111 ///
112 /// By default this is disabled. It may alternatively be selectively
113 /// enabled in the regular expression itself via the `i` flag.
114 pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
115 self.hir.case_insensitive(yes);
116 self
117 }
118
119 /// Enable or disable the multi-line matching flag by default.
120 ///
121 /// By default this is disabled. It may alternatively be selectively
122 /// enabled in the regular expression itself via the `m` flag.
123 pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
124 self.hir.multi_line(yes);
125 self
126 }
127
128 /// Enable or disable the "dot matches any character" flag by default.
129 ///
130 /// By default this is disabled. It may alternatively be selectively
131 /// enabled in the regular expression itself via the `s` flag.
132 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
133 self.hir.dot_matches_new_line(yes);
134 self
135 }
136
137 /// Enable or disable the "swap greed" flag by default.
138 ///
139 /// By default this is disabled. It may alternatively be selectively
140 /// enabled in the regular expression itself via the `U` flag.
141 pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
142 self.hir.swap_greed(yes);
143 self
144 }
145
146 /// Enable or disable the Unicode flag (`u`) by default.
147 ///
148 /// By default this is **enabled**. It may alternatively be selectively
149 /// disabled in the regular expression itself via the `u` flag.
150 ///
151 /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
152 /// default), a regular expression will fail to parse if Unicode mode is
153 /// disabled and a sub-expression could possibly match invalid UTF-8.
154 pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
155 self.hir.unicode(yes);
156 self
157 }
158}
159
160/// A convenience parser for regular expressions.
161///
162/// This parser takes as input a regular expression pattern string (the
163/// "concrete syntax") and returns a high-level intermediate representation
164/// (the HIR) suitable for most types of analysis. In particular, this parser
165/// hides the intermediate state of producing an AST (the "abstract syntax").
166/// The AST is itself far more complex than the HIR, so this parser serves as a
167/// convenience for never having to deal with it at all.
168///
169/// If callers have more fine grained use cases that need an AST, then please
170/// see the [`ast::parse`](ast/parse/index.html) module.
171///
172/// A `Parser` can be configured in more detail via a
173/// [`ParserBuilder`](struct.ParserBuilder.html).
174#[derive(Clone, Debug)]
175pub struct Parser {
176 ast: ast::parse::Parser,
177 hir: hir::translate::Translator,
178}
179
180impl Parser {
181 /// Create a new parser with a default configuration.
182 ///
183 /// The parser can be run with `parse` method. The parse method returns
184 /// a high level intermediate representation of the given regular
185 /// expression.
186 ///
187 /// To set configuration options on the parser, use
188 /// [`ParserBuilder`](struct.ParserBuilder.html).
189 pub fn new() -> Parser {
190 ParserBuilder::new().build()
191 }
192
193 /// Parse the regular expression into a high level intermediate
194 /// representation.
195 pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
196 let ast = self.ast.parse(pattern)?;
197 let hir = self.hir.translate(pattern, &ast)?;
198 Ok(hir)
199 }
200}
201