| 1 | use crate::{ast, hir, Error}; | 
| 2 |  | 
|---|
| 3 | /// A convenience routine for parsing a regex using default options. | 
|---|
| 4 | /// | 
|---|
| 5 | /// This is equivalent to `Parser::new().parse(pattern)`. | 
|---|
| 6 | /// | 
|---|
| 7 | /// If you need to set non-default options, then use a [`ParserBuilder`]. | 
|---|
| 8 | /// | 
|---|
| 9 | /// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically | 
|---|
| 10 | /// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator | 
|---|
| 11 | /// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then | 
|---|
| 12 | /// you should use a [`ast::parse::Parser`]. | 
|---|
| 13 | pub fn parse(pattern: &str) -> Result<hir::Hir, Error> { | 
|---|
| 14 | Parser::new().parse(pattern) | 
|---|
| 15 | } | 
|---|
| 16 |  | 
|---|
| 17 | /// A builder for a regular expression parser. | 
|---|
| 18 | /// | 
|---|
| 19 | /// This builder permits modifying configuration options for the parser. | 
|---|
| 20 | /// | 
|---|
| 21 | /// This type combines the builder options for both the [AST | 
|---|
| 22 | /// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR | 
|---|
| 23 | /// `TranslatorBuilder`](hir::translate::TranslatorBuilder). | 
|---|
| 24 | #[ derive(Clone, Debug, Default)] | 
|---|
| 25 | pub struct ParserBuilder { | 
|---|
| 26 | ast: ast::parse::ParserBuilder, | 
|---|
| 27 | hir: hir::translate::TranslatorBuilder, | 
|---|
| 28 | } | 
|---|
| 29 |  | 
|---|
| 30 | impl ParserBuilder { | 
|---|
| 31 | /// Create a new parser builder with a default configuration. | 
|---|
| 32 | pub fn new() -> ParserBuilder { | 
|---|
| 33 | ParserBuilder::default() | 
|---|
| 34 | } | 
|---|
| 35 |  | 
|---|
| 36 | /// Build a parser from this configuration with the given pattern. | 
|---|
| 37 | pub fn build(&self) -> Parser { | 
|---|
| 38 | Parser { ast: self.ast.build(), hir: self.hir.build() } | 
|---|
| 39 | } | 
|---|
| 40 |  | 
|---|
| 41 | /// Set the nesting limit for this parser. | 
|---|
| 42 | /// | 
|---|
| 43 | /// The nesting limit controls how deep the abstract syntax tree is allowed | 
|---|
| 44 | /// to be. If the AST exceeds the given limit (e.g., with too many nested | 
|---|
| 45 | /// groups), then an error is returned by the parser. | 
|---|
| 46 | /// | 
|---|
| 47 | /// The purpose of this limit is to act as a heuristic to prevent stack | 
|---|
| 48 | /// overflow for consumers that do structural induction on an `Ast` using | 
|---|
| 49 | /// explicit recursion. While this crate never does this (instead using | 
|---|
| 50 | /// constant stack space and moving the call stack to the heap), other | 
|---|
| 51 | /// crates may. | 
|---|
| 52 | /// | 
|---|
| 53 | /// This limit is not checked until the entire Ast is parsed. Therefore, | 
|---|
| 54 | /// if callers want to put a limit on the amount of heap space used, then | 
|---|
| 55 | /// they should impose a limit on the length, in bytes, of the concrete | 
|---|
| 56 | /// pattern string. In particular, this is viable since this parser | 
|---|
| 57 | /// implementation will limit itself to heap space proportional to the | 
|---|
| 58 | /// length of the pattern string. | 
|---|
| 59 | /// | 
|---|
| 60 | /// Note that a nest limit of `0` will return a nest limit error for most | 
|---|
| 61 | /// patterns but not all. For example, a nest limit of `0` permits `a` but | 
|---|
| 62 | /// not `ab`, since `ab` requires a concatenation, which results in a nest | 
|---|
| 63 | /// depth of `1`. In general, a nest limit is not something that manifests | 
|---|
| 64 | /// in an obvious way in the concrete syntax, therefore, it should not be | 
|---|
| 65 | /// used in a granular way. | 
|---|
| 66 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { | 
|---|
| 67 | self.ast.nest_limit(limit); | 
|---|
| 68 | self | 
|---|
| 69 | } | 
|---|
| 70 |  | 
|---|
| 71 | /// Whether to support octal syntax or not. | 
|---|
| 72 | /// | 
|---|
| 73 | /// Octal syntax is a little-known way of uttering Unicode codepoints in | 
|---|
| 74 | /// a regular expression. For example, `a`, `\x61`, `\u0061` and | 
|---|
| 75 | /// `\141` are all equivalent regular expressions, where the last example | 
|---|
| 76 | /// shows octal syntax. | 
|---|
| 77 | /// | 
|---|
| 78 | /// While supporting octal syntax isn't in and of itself a problem, it does | 
|---|
| 79 | /// make good error messages harder. That is, in PCRE based regex engines, | 
|---|
| 80 | /// syntax like `\0` invokes a backreference, which is explicitly | 
|---|
| 81 | /// unsupported in Rust's regex engine. However, many users expect it to | 
|---|
| 82 | /// be supported. Therefore, when octal support is disabled, the error | 
|---|
| 83 | /// message will explicitly mention that backreferences aren't supported. | 
|---|
| 84 | /// | 
|---|
| 85 | /// Octal syntax is disabled by default. | 
|---|
| 86 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 87 | self.ast.octal(yes); | 
|---|
| 88 | self | 
|---|
| 89 | } | 
|---|
| 90 |  | 
|---|
| 91 | /// When disabled, translation will permit the construction of a regular | 
|---|
| 92 | /// expression that may match invalid UTF-8. | 
|---|
| 93 | /// | 
|---|
| 94 | /// When enabled (the default), the translator is guaranteed to produce an | 
|---|
| 95 | /// expression that, for non-empty matches, will only ever produce spans | 
|---|
| 96 | /// that are entirely valid UTF-8 (otherwise, the translator will return an | 
|---|
| 97 | /// error). | 
|---|
| 98 | /// | 
|---|
| 99 | /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even | 
|---|
| 100 | /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete | 
|---|
| 101 | /// syntax) will be allowed even though they can produce matches that split | 
|---|
| 102 | /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" | 
|---|
| 103 | /// matches, and it is expected that the regex engine itself must handle | 
|---|
| 104 | /// these cases if necessary (perhaps by suppressing any zero-width matches | 
|---|
| 105 | /// that split a codepoint). | 
|---|
| 106 | pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 107 | self.hir.utf8(yes); | 
|---|
| 108 | self | 
|---|
| 109 | } | 
|---|
| 110 |  | 
|---|
| 111 | /// Enable verbose mode in the regular expression. | 
|---|
| 112 | /// | 
|---|
| 113 | /// When enabled, verbose mode permits insignificant whitespace in many | 
|---|
| 114 | /// places in the regular expression, as well as comments. Comments are | 
|---|
| 115 | /// started using `#` and continue until the end of the line. | 
|---|
| 116 | /// | 
|---|
| 117 | /// By default, this is disabled. It may be selectively enabled in the | 
|---|
| 118 | /// regular expression by using the `x` flag regardless of this setting. | 
|---|
| 119 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 120 | self.ast.ignore_whitespace(yes); | 
|---|
| 121 | self | 
|---|
| 122 | } | 
|---|
| 123 |  | 
|---|
| 124 | /// Enable or disable the case insensitive flag by default. | 
|---|
| 125 | /// | 
|---|
| 126 | /// By default this is disabled. It may alternatively be selectively | 
|---|
| 127 | /// enabled in the regular expression itself via the `i` flag. | 
|---|
| 128 | pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 129 | self.hir.case_insensitive(yes); | 
|---|
| 130 | self | 
|---|
| 131 | } | 
|---|
| 132 |  | 
|---|
| 133 | /// Enable or disable the multi-line matching flag by default. | 
|---|
| 134 | /// | 
|---|
| 135 | /// By default this is disabled. It may alternatively be selectively | 
|---|
| 136 | /// enabled in the regular expression itself via the `m` flag. | 
|---|
| 137 | pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 138 | self.hir.multi_line(yes); | 
|---|
| 139 | self | 
|---|
| 140 | } | 
|---|
| 141 |  | 
|---|
| 142 | /// Enable or disable the "dot matches any character" flag by default. | 
|---|
| 143 | /// | 
|---|
| 144 | /// By default this is disabled. It may alternatively be selectively | 
|---|
| 145 | /// enabled in the regular expression itself via the `s` flag. | 
|---|
| 146 | pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 147 | self.hir.dot_matches_new_line(yes); | 
|---|
| 148 | self | 
|---|
| 149 | } | 
|---|
| 150 |  | 
|---|
| 151 | /// Enable or disable the CRLF mode flag by default. | 
|---|
| 152 | /// | 
|---|
| 153 | /// By default this is disabled. It may alternatively be selectively | 
|---|
| 154 | /// enabled in the regular expression itself via the `R` flag. | 
|---|
| 155 | /// | 
|---|
| 156 | /// When CRLF mode is enabled, the following happens: | 
|---|
| 157 | /// | 
|---|
| 158 | /// * Unless `dot_matches_new_line` is enabled, `.` will match any character | 
|---|
| 159 | /// except for `\r` and `\n`. | 
|---|
| 160 | /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, | 
|---|
| 161 | /// `\r` and `\n` as line terminators. And in particular, neither will | 
|---|
| 162 | /// match between a `\r` and a `\n`. | 
|---|
| 163 | pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 164 | self.hir.crlf(yes); | 
|---|
| 165 | self | 
|---|
| 166 | } | 
|---|
| 167 |  | 
|---|
| 168 | /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. | 
|---|
| 169 | /// | 
|---|
| 170 | /// Namely, instead of `.` (by default) matching everything except for `\n`, | 
|---|
| 171 | /// this will cause `.` to match everything except for the byte given. | 
|---|
| 172 | /// | 
|---|
| 173 | /// If `.` is used in a context where Unicode mode is enabled and this byte | 
|---|
| 174 | /// isn't ASCII, then an error will be returned. When Unicode mode is | 
|---|
| 175 | /// disabled, then any byte is permitted, but will return an error if UTF-8 | 
|---|
| 176 | /// mode is enabled and it is a non-ASCII byte. | 
|---|
| 177 | /// | 
|---|
| 178 | /// In short, any ASCII value for a line terminator is always okay. But a | 
|---|
| 179 | /// non-ASCII byte might result in an error depending on whether Unicode | 
|---|
| 180 | /// mode or UTF-8 mode are enabled. | 
|---|
| 181 | /// | 
|---|
| 182 | /// Note that if `R` mode is enabled then it always takes precedence and | 
|---|
| 183 | /// the line terminator will be treated as `\r` and `\n` simultaneously. | 
|---|
| 184 | /// | 
|---|
| 185 | /// Note also that this *doesn't* impact the look-around assertions | 
|---|
| 186 | /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional | 
|---|
| 187 | /// configuration in the regex engine itself. | 
|---|
| 188 | pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder { | 
|---|
| 189 | self.hir.line_terminator(byte); | 
|---|
| 190 | self | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | /// Enable or disable the "swap greed" flag by default. | 
|---|
| 194 | /// | 
|---|
| 195 | /// By default this is disabled. It may alternatively be selectively | 
|---|
| 196 | /// enabled in the regular expression itself via the `U` flag. | 
|---|
| 197 | pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 198 | self.hir.swap_greed(yes); | 
|---|
| 199 | self | 
|---|
| 200 | } | 
|---|
| 201 |  | 
|---|
| 202 | /// Enable or disable the Unicode flag (`u`) by default. | 
|---|
| 203 | /// | 
|---|
| 204 | /// By default this is **enabled**. It may alternatively be selectively | 
|---|
| 205 | /// disabled in the regular expression itself via the `u` flag. | 
|---|
| 206 | /// | 
|---|
| 207 | /// Note that unless `utf8` is disabled (it's enabled by default), a | 
|---|
| 208 | /// regular expression will fail to parse if Unicode mode is disabled and a | 
|---|
| 209 | /// sub-expression could possibly match invalid UTF-8. | 
|---|
| 210 | pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { | 
|---|
| 211 | self.hir.unicode(yes); | 
|---|
| 212 | self | 
|---|
| 213 | } | 
|---|
| 214 | } | 
|---|
| 215 |  | 
|---|
| 216 | /// A convenience parser for regular expressions. | 
|---|
| 217 | /// | 
|---|
| 218 | /// This parser takes as input a regular expression pattern string (the | 
|---|
| 219 | /// "concrete syntax") and returns a high-level intermediate representation | 
|---|
| 220 | /// (the HIR) suitable for most types of analysis. In particular, this parser | 
|---|
| 221 | /// hides the intermediate state of producing an AST (the "abstract syntax"). | 
|---|
| 222 | /// The AST is itself far more complex than the HIR, so this parser serves as a | 
|---|
| 223 | /// convenience for never having to deal with it at all. | 
|---|
| 224 | /// | 
|---|
| 225 | /// If callers have more fine grained use cases that need an AST, then please | 
|---|
| 226 | /// see the [`ast::parse`] module. | 
|---|
| 227 | /// | 
|---|
| 228 | /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. | 
|---|
| 229 | #[ derive(Clone, Debug)] | 
|---|
| 230 | pub struct Parser { | 
|---|
| 231 | ast: ast::parse::Parser, | 
|---|
| 232 | hir: hir::translate::Translator, | 
|---|
| 233 | } | 
|---|
| 234 |  | 
|---|
| 235 | impl Parser { | 
|---|
| 236 | /// Create a new parser with a default configuration. | 
|---|
| 237 | /// | 
|---|
| 238 | /// The parser can be run with `parse` method. The parse method returns | 
|---|
| 239 | /// a high level intermediate representation of the given regular | 
|---|
| 240 | /// expression. | 
|---|
| 241 | /// | 
|---|
| 242 | /// To set configuration options on the parser, use [`ParserBuilder`]. | 
|---|
| 243 | pub fn new() -> Parser { | 
|---|
| 244 | ParserBuilder::new().build() | 
|---|
| 245 | } | 
|---|
| 246 |  | 
|---|
| 247 | /// Parse the regular expression into a high level intermediate | 
|---|
| 248 | /// representation. | 
|---|
| 249 | pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> { | 
|---|
| 250 | let ast: Ast = self.ast.parse(pattern)?; | 
|---|
| 251 | let hir: Hir = self.hir.translate(pattern, &ast)?; | 
|---|
| 252 | Ok(hir) | 
|---|
| 253 | } | 
|---|
| 254 | } | 
|---|
| 255 |  | 
|---|