| 1 | use core::{borrow::Borrow, cell::RefCell}; |
| 2 | |
| 3 | use alloc::{sync::Arc, vec, vec::Vec}; |
| 4 | |
| 5 | use regex_syntax::{ |
| 6 | hir::{self, Hir}, |
| 7 | utf8::{Utf8Range, Utf8Sequences}, |
| 8 | ParserBuilder, |
| 9 | }; |
| 10 | |
| 11 | use crate::{ |
| 12 | nfa::thompson::{ |
| 13 | builder::Builder, |
| 14 | error::BuildError, |
| 15 | literal_trie::LiteralTrie, |
| 16 | map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}, |
| 17 | nfa::{Transition, NFA}, |
| 18 | range_trie::RangeTrie, |
| 19 | }, |
| 20 | util::{ |
| 21 | look::{Look, LookMatcher}, |
| 22 | primitives::{PatternID, StateID}, |
| 23 | }, |
| 24 | }; |
| 25 | |
/// The configuration used for a Thompson NFA compiler.
///
/// Every field is `None` until explicitly configured; the corresponding
/// `get_*` accessor supplies the documented default in that case.
#[derive (Clone, Debug, Default)]
pub struct Config {
    /// Whether to enable UTF-8 mode (empty matches never split a codepoint).
    utf8: Option<bool>,
    /// Whether to compile the NFA in reverse for back-to-front matching.
    reverse: Option<bool>,
    /// An approximate limit, in bytes of heap, on the size of the compiled
    /// NFA. The outer `Option` distinguishes "unset" from an explicit
    /// `None` (no limit).
    nfa_size_limit: Option<Option<usize>>,
    /// Whether to spend extra effort shrinking the NFA during construction.
    shrink: Option<bool>,
    /// Which kinds of capture states to compile into the NFA.
    which_captures: Option<WhichCaptures>,
    /// The matcher used to evaluate look-around assertions.
    look_matcher: Option<LookMatcher>,
    /// Whether to compile an unanchored prefix. Only configurable in tests.
    #[cfg (test)]
    unanchored_prefix: Option<bool>,
}
| 38 | |
| 39 | impl Config { |
| 40 | /// Return a new default Thompson NFA compiler configuration. |
| 41 | pub fn new() -> Config { |
| 42 | Config::default() |
| 43 | } |
| 44 | |
| 45 | /// Whether to enable UTF-8 mode during search or not. |
| 46 | /// |
| 47 | /// A regex engine is said to be in UTF-8 mode when it guarantees that |
| 48 | /// all matches returned by it have spans consisting of only valid UTF-8. |
| 49 | /// That is, it is impossible for a match span to be returned that |
| 50 | /// contains any invalid UTF-8. |
| 51 | /// |
| 52 | /// UTF-8 mode generally consists of two things: |
| 53 | /// |
| 54 | /// 1. Whether the NFA's states are constructed such that all paths to a |
| 55 | /// match state that consume at least one byte always correspond to valid |
| 56 | /// UTF-8. |
| 57 | /// 2. Whether all paths to a match state that do _not_ consume any bytes |
| 58 | /// should always correspond to valid UTF-8 boundaries. |
| 59 | /// |
| 60 | /// (1) is a guarantee made by whoever constructs the NFA. |
| 61 | /// If you're parsing a regex from its concrete syntax, then |
| 62 | /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make |
| 63 | /// this guarantee for you. It does it by returning an error if the regex |
    /// pattern could ever report a non-empty match span that contains invalid
| 65 | /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex |
| 66 | /// successfully parses, then you're guaranteed that the corresponding NFA |
| 67 | /// will only ever report non-empty match spans containing valid UTF-8. |
| 68 | /// |
| 69 | /// (2) is a trickier guarantee because it cannot be enforced by the NFA |
| 70 | /// state graph itself. Consider, for example, the regex `a*`. It matches |
    /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
| 72 | /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint, |
| 73 | /// and thus correspond to invalid UTF-8 boundaries. Therefore, this |
| 74 | /// guarantee must be made at a higher level than the NFA state graph |
| 75 | /// itself. This crate deals with this case in each regex engine. Namely, |
| 76 | /// when a zero-width match that splits a codepoint is found and UTF-8 |
| 77 | /// mode enabled, then it is ignored and the engine moves on looking for |
| 78 | /// the next match. |
| 79 | /// |
| 80 | /// Thus, UTF-8 mode is both a promise that the NFA built only reports |
| 81 | /// non-empty matches that are valid UTF-8, and an *instruction* to regex |
| 82 | /// engines that empty matches that split codepoints should be banned. |
| 83 | /// |
| 84 | /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans, |
| 85 | /// it only makes sense to enable this option when you *know* your haystack |
| 86 | /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and |
| 87 | /// searching a haystack that contains invalid UTF-8 leads to **unspecified |
| 88 | /// behavior**. |
| 89 | /// |
| 90 | /// Therefore, it may make sense to enable `syntax::Config::utf8` while |
| 91 | /// simultaneously *disabling* this option. That would ensure all non-empty |
| 92 | /// match spans are valid UTF-8, but that empty match spans may still split |
| 93 | /// a codepoint or match at other places that aren't valid UTF-8. |
| 94 | /// |
| 95 | /// In general, this mode is only relevant if your regex can match the |
| 96 | /// empty string. Most regexes don't. |
| 97 | /// |
| 98 | /// This is enabled by default. |
| 99 | /// |
| 100 | /// # Example |
| 101 | /// |
| 102 | /// This example shows how UTF-8 mode can impact the match spans that may |
| 103 | /// be reported in certain cases. |
| 104 | /// |
| 105 | /// ``` |
| 106 | /// use regex_automata::{ |
| 107 | /// nfa::thompson::{self, pikevm::PikeVM}, |
| 108 | /// Match, Input, |
| 109 | /// }; |
| 110 | /// |
| 111 | /// let re = PikeVM::new("" )?; |
| 112 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| 113 | /// |
| 114 | /// // UTF-8 mode is enabled by default. |
    /// let mut input = Input::new("☃");
| 116 | /// re.search(&mut cache, &input, &mut caps); |
| 117 | /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); |
| 118 | /// |
| 119 | /// // Even though an empty regex matches at 1..1, our next match is |
| 120 | /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is |
| 121 | /// // three bytes long). |
| 122 | /// input.set_start(1); |
| 123 | /// re.search(&mut cache, &input, &mut caps); |
| 124 | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
| 125 | /// |
| 126 | /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: |
| 127 | /// let re = PikeVM::builder() |
| 128 | /// .thompson(thompson::Config::new().utf8(false)) |
| 129 | /// .build("" )?; |
| 130 | /// re.search(&mut cache, &input, &mut caps); |
| 131 | /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); |
| 132 | /// |
| 133 | /// input.set_start(2); |
| 134 | /// re.search(&mut cache, &input, &mut caps); |
| 135 | /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); |
| 136 | /// |
| 137 | /// input.set_start(3); |
| 138 | /// re.search(&mut cache, &input, &mut caps); |
| 139 | /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); |
| 140 | /// |
| 141 | /// input.set_start(4); |
| 142 | /// re.search(&mut cache, &input, &mut caps); |
| 143 | /// assert_eq!(None, caps.get_match()); |
| 144 | /// |
| 145 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 146 | /// ``` |
| 147 | pub fn utf8(mut self, yes: bool) -> Config { |
| 148 | self.utf8 = Some(yes); |
| 149 | self |
| 150 | } |
| 151 | |
| 152 | /// Reverse the NFA. |
| 153 | /// |
| 154 | /// A NFA reversal is performed by reversing all of the concatenated |
| 155 | /// sub-expressions in the original pattern, recursively. (Look around |
| 156 | /// operators are also inverted.) The resulting NFA can be used to match |
| 157 | /// the pattern starting from the end of a string instead of the beginning |
| 158 | /// of a string. |
| 159 | /// |
| 160 | /// Reversing the NFA is useful for building a reverse DFA, which is most |
| 161 | /// useful for finding the start of a match after its ending position has |
| 162 | /// been found. NFA execution engines typically do not work on reverse |
| 163 | /// NFAs. For example, currently, the Pike VM reports the starting location |
| 164 | /// of matches without a reverse NFA. |
| 165 | /// |
| 166 | /// Currently, enabling this setting requires disabling the |
| 167 | /// [`captures`](Config::captures) setting. If both are enabled, then the |
| 168 | /// compiler will return an error. It is expected that this limitation will |
| 169 | /// be lifted in the future. |
| 170 | /// |
| 171 | /// This is disabled by default. |
| 172 | /// |
| 173 | /// # Example |
| 174 | /// |
| 175 | /// This example shows how to build a DFA from a reverse NFA, and then use |
| 176 | /// the DFA to search backwards. |
| 177 | /// |
| 178 | /// ``` |
| 179 | /// use regex_automata::{ |
| 180 | /// dfa::{self, Automaton}, |
| 181 | /// nfa::thompson::{NFA, WhichCaptures}, |
| 182 | /// HalfMatch, Input, |
| 183 | /// }; |
| 184 | /// |
| 185 | /// let dfa = dfa::dense::Builder::new() |
| 186 | /// .thompson(NFA::config() |
| 187 | /// .which_captures(WhichCaptures::None) |
| 188 | /// .reverse(true) |
| 189 | /// ) |
| 190 | /// .build("baz[0-9]+" )?; |
| 191 | /// let expected = Some(HalfMatch::must(0, 3)); |
| 192 | /// assert_eq!( |
| 193 | /// expected, |
| 194 | /// dfa.try_search_rev(&Input::new("foobaz12345bar" ))?, |
| 195 | /// ); |
| 196 | /// |
| 197 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 198 | /// ``` |
| 199 | pub fn reverse(mut self, yes: bool) -> Config { |
| 200 | self.reverse = Some(yes); |
| 201 | self |
| 202 | } |
| 203 | |
| 204 | /// Sets an approximate size limit on the total heap used by the NFA being |
| 205 | /// compiled. |
| 206 | /// |
| 207 | /// This permits imposing constraints on the size of a compiled NFA. This |
| 208 | /// may be useful in contexts where the regex pattern is untrusted and one |
| 209 | /// wants to avoid using too much memory. |
| 210 | /// |
| 211 | /// This size limit does not apply to auxiliary heap used during |
| 212 | /// compilation that is not part of the built NFA. |
| 213 | /// |
| 214 | /// Note that this size limit is applied during compilation in order for |
| 215 | /// the limit to prevent too much heap from being used. However, the |
| 216 | /// implementation may use an intermediate NFA representation that is |
| 217 | /// otherwise slightly bigger than the final public form. Since the size |
| 218 | /// limit may be applied to an intermediate representation, there is not |
| 219 | /// necessarily a precise correspondence between the configured size limit |
| 220 | /// and the heap usage of the final NFA. |
| 221 | /// |
| 222 | /// There is no size limit by default. |
| 223 | /// |
| 224 | /// # Example |
| 225 | /// |
| 226 | /// This example demonstrates how Unicode mode can greatly increase the |
| 227 | /// size of the NFA. |
| 228 | /// |
| 229 | /// ``` |
| 230 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 231 | /// use regex_automata::nfa::thompson::NFA; |
| 232 | /// |
| 233 | /// // 400KB isn't enough! |
| 234 | /// NFA::compiler() |
| 235 | /// .configure(NFA::config().nfa_size_limit(Some(400_000))) |
| 236 | /// .build(r"\w{20}" ) |
| 237 | /// .unwrap_err(); |
| 238 | /// |
| 239 | /// // ... but 500KB probably is. |
| 240 | /// let nfa = NFA::compiler() |
| 241 | /// .configure(NFA::config().nfa_size_limit(Some(500_000))) |
| 242 | /// .build(r"\w{20}" )?; |
| 243 | /// |
| 244 | /// assert_eq!(nfa.pattern_len(), 1); |
| 245 | /// |
| 246 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 247 | /// ``` |
| 248 | pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config { |
| 249 | self.nfa_size_limit = Some(bytes); |
| 250 | self |
| 251 | } |
| 252 | |
| 253 | /// Apply best effort heuristics to shrink the NFA at the expense of more |
| 254 | /// time/memory. |
| 255 | /// |
| 256 | /// Generally speaking, if one is using an NFA to compile a DFA, then the |
| 257 | /// extra time used to shrink the NFA will be more than made up for during |
| 258 | /// DFA construction (potentially by a lot). In other words, enabling this |
| 259 | /// can substantially decrease the overall amount of time it takes to build |
| 260 | /// a DFA. |
| 261 | /// |
| 262 | /// A reason to keep this disabled is if you want to compile an NFA and |
| 263 | /// start using it as quickly as possible without needing to build a DFA, |
| 264 | /// and you don't mind using a bit of extra memory for the NFA. e.g., for |
| 265 | /// an NFA simulation or for a lazy DFA. |
| 266 | /// |
| 267 | /// NFA shrinking is currently most useful when compiling a reverse |
| 268 | /// NFA with large Unicode character classes. In particular, it trades |
| 269 | /// additional CPU time during NFA compilation in favor of generating fewer |
| 270 | /// NFA states. |
| 271 | /// |
| 272 | /// This is disabled by default because it can increase compile times |
| 273 | /// quite a bit if you aren't building a full DFA. |
| 274 | /// |
| 275 | /// # Example |
| 276 | /// |
| 277 | /// This example shows that NFA shrinking can lead to substantial space |
| 278 | /// savings in some cases. Notice that, as noted above, we build a reverse |
| 279 | /// DFA and use a pattern with a large Unicode character class. |
| 280 | /// |
| 281 | /// ``` |
| 282 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 283 | /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; |
| 284 | /// |
| 285 | /// // Currently we have to disable captures when enabling reverse NFA. |
| 286 | /// let config = NFA::config() |
| 287 | /// .which_captures(WhichCaptures::None) |
| 288 | /// .reverse(true); |
| 289 | /// let not_shrunk = NFA::compiler() |
| 290 | /// .configure(config.clone().shrink(false)) |
| 291 | /// .build(r"\w" )?; |
| 292 | /// let shrunk = NFA::compiler() |
| 293 | /// .configure(config.clone().shrink(true)) |
| 294 | /// .build(r"\w" )?; |
| 295 | /// |
| 296 | /// // While a specific shrink factor is not guaranteed, the savings can be |
| 297 | /// // considerable in some cases. |
| 298 | /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len()); |
| 299 | /// |
| 300 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 301 | /// ``` |
| 302 | pub fn shrink(mut self, yes: bool) -> Config { |
| 303 | self.shrink = Some(yes); |
| 304 | self |
| 305 | } |
| 306 | |
| 307 | /// Whether to include 'Capture' states in the NFA. |
| 308 | /// |
| 309 | /// Currently, enabling this setting requires disabling the |
| 310 | /// [`reverse`](Config::reverse) setting. If both are enabled, then the |
| 311 | /// compiler will return an error. It is expected that this limitation will |
| 312 | /// be lifted in the future. |
| 313 | /// |
| 314 | /// This is enabled by default. |
| 315 | /// |
| 316 | /// # Example |
| 317 | /// |
| 318 | /// This example demonstrates that some regex engines, like the Pike VM, |
| 319 | /// require capturing states to be present in the NFA to report match |
| 320 | /// offsets. |
| 321 | /// |
| 322 | /// (Note that since this method is deprecated, the example below uses |
| 323 | /// [`Config::which_captures`] to disable capture states.) |
| 324 | /// |
| 325 | /// ``` |
| 326 | /// use regex_automata::nfa::thompson::{ |
| 327 | /// pikevm::PikeVM, |
| 328 | /// NFA, |
| 329 | /// WhichCaptures, |
| 330 | /// }; |
| 331 | /// |
| 332 | /// let re = PikeVM::builder() |
| 333 | /// .thompson(NFA::config().which_captures(WhichCaptures::None)) |
| 334 | /// .build(r"[a-z]+" )?; |
| 335 | /// let mut cache = re.create_cache(); |
| 336 | /// |
| 337 | /// assert!(re.is_match(&mut cache, "abc" )); |
| 338 | /// assert_eq!(None, re.find(&mut cache, "abc" )); |
| 339 | /// |
| 340 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 341 | /// ``` |
| 342 | #[deprecated (since = "0.3.5" , note = "use which_captures instead" )] |
| 343 | pub fn captures(self, yes: bool) -> Config { |
| 344 | self.which_captures(if yes { |
| 345 | WhichCaptures::All |
| 346 | } else { |
| 347 | WhichCaptures::None |
| 348 | }) |
| 349 | } |
| 350 | |
| 351 | /// Configures what kinds of capture groups are compiled into |
| 352 | /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a |
| 353 | /// Thompson NFA. |
| 354 | /// |
| 355 | /// Currently, using any option except for [`WhichCaptures::None`] requires |
| 356 | /// disabling the [`reverse`](Config::reverse) setting. If both are |
| 357 | /// enabled, then the compiler will return an error. It is expected that |
| 358 | /// this limitation will be lifted in the future. |
| 359 | /// |
| 360 | /// This is set to [`WhichCaptures::All`] by default. Callers may wish to |
    /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
| 362 | /// overhead of capture states for explicit groups. Usually this occurs |
| 363 | /// when one wants to use the `PikeVM` only for determining the overall |
| 364 | /// match. Otherwise, the `PikeVM` could use much more memory than is |
| 365 | /// necessary. |
| 366 | /// |
| 367 | /// # Example |
| 368 | /// |
| 369 | /// This example demonstrates that some regex engines, like the Pike VM, |
| 370 | /// require capturing states to be present in the NFA to report match |
| 371 | /// offsets. |
| 372 | /// |
| 373 | /// ``` |
| 374 | /// use regex_automata::nfa::thompson::{ |
| 375 | /// pikevm::PikeVM, |
| 376 | /// NFA, |
| 377 | /// WhichCaptures, |
| 378 | /// }; |
| 379 | /// |
| 380 | /// let re = PikeVM::builder() |
| 381 | /// .thompson(NFA::config().which_captures(WhichCaptures::None)) |
| 382 | /// .build(r"[a-z]+" )?; |
| 383 | /// let mut cache = re.create_cache(); |
| 384 | /// |
| 385 | /// assert!(re.is_match(&mut cache, "abc" )); |
| 386 | /// assert_eq!(None, re.find(&mut cache, "abc" )); |
| 387 | /// |
| 388 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 389 | /// ``` |
| 390 | /// |
| 391 | /// The same applies to the bounded backtracker: |
| 392 | /// |
| 393 | /// ``` |
| 394 | /// use regex_automata::nfa::thompson::{ |
| 395 | /// backtrack::BoundedBacktracker, |
| 396 | /// NFA, |
| 397 | /// WhichCaptures, |
| 398 | /// }; |
| 399 | /// |
| 400 | /// let re = BoundedBacktracker::builder() |
| 401 | /// .thompson(NFA::config().which_captures(WhichCaptures::None)) |
| 402 | /// .build(r"[a-z]+" )?; |
| 403 | /// let mut cache = re.create_cache(); |
| 404 | /// |
| 405 | /// assert!(re.try_is_match(&mut cache, "abc" )?); |
| 406 | /// assert_eq!(None, re.try_find(&mut cache, "abc" )?); |
| 407 | /// |
| 408 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 409 | /// ``` |
| 410 | pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { |
| 411 | self.which_captures = Some(which_captures); |
| 412 | self |
| 413 | } |
| 414 | |
| 415 | /// Sets the look-around matcher that should be used with this NFA. |
| 416 | /// |
| 417 | /// A look-around matcher determines how to match look-around assertions. |
| 418 | /// In particular, some assertions are configurable. For example, the |
| 419 | /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed |
| 420 | /// from the default of `\n` to any other byte. |
| 421 | /// |
| 422 | /// # Example |
| 423 | /// |
| 424 | /// This shows how to change the line terminator for multi-line assertions. |
| 425 | /// |
| 426 | /// ``` |
| 427 | /// use regex_automata::{ |
| 428 | /// nfa::thompson::{self, pikevm::PikeVM}, |
| 429 | /// util::look::LookMatcher, |
| 430 | /// Match, Input, |
| 431 | /// }; |
| 432 | /// |
| 433 | /// let mut lookm = LookMatcher::new(); |
    /// lookm.set_line_terminator(b'\x00');
| 435 | /// |
| 436 | /// let re = PikeVM::builder() |
| 437 | /// .thompson(thompson::Config::new().look_matcher(lookm)) |
| 438 | /// .build(r"(?m)^[a-z]+$" )?; |
| 439 | /// let mut cache = re.create_cache(); |
| 440 | /// |
| 441 | /// // Multi-line assertions now use NUL as a terminator. |
| 442 | /// assert_eq!( |
| 443 | /// Some(Match::must(0, 1..4)), |
    ///     re.find(&mut cache, b"\x00abc\x00"),
| 445 | /// ); |
| 446 | /// // ... and \n is no longer recognized as a terminator. |
| 447 | /// assert_eq!( |
| 448 | /// None, |
    ///     re.find(&mut cache, b"\nabc\n"),
| 450 | /// ); |
| 451 | /// |
| 452 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 453 | /// ``` |
| 454 | pub fn look_matcher(mut self, m: LookMatcher) -> Config { |
| 455 | self.look_matcher = Some(m); |
| 456 | self |
| 457 | } |
| 458 | |
| 459 | /// Whether to compile an unanchored prefix into this NFA. |
| 460 | /// |
| 461 | /// This is enabled by default. It is made available for tests only to make |
| 462 | /// it easier to unit test the output of the compiler. |
#[cfg(test)]
fn unanchored_prefix(self, yes: bool) -> Config {
    // Test-only knob: record the explicit choice.
    Config { unanchored_prefix: Some(yes), ..self }
}
| 468 | |
| 469 | /// Returns whether this configuration has enabled UTF-8 mode. |
| 470 | pub fn get_utf8(&self) -> bool { |
| 471 | self.utf8.unwrap_or(true) |
| 472 | } |
| 473 | |
| 474 | /// Returns whether this configuration has enabled reverse NFA compilation. |
| 475 | pub fn get_reverse(&self) -> bool { |
| 476 | self.reverse.unwrap_or(false) |
| 477 | } |
| 478 | |
| 479 | /// Return the configured NFA size limit, if it exists, in the number of |
| 480 | /// bytes of heap used. |
| 481 | pub fn get_nfa_size_limit(&self) -> Option<usize> { |
| 482 | self.nfa_size_limit.unwrap_or(None) |
| 483 | } |
| 484 | |
| 485 | /// Return whether NFA shrinking is enabled. |
| 486 | pub fn get_shrink(&self) -> bool { |
| 487 | self.shrink.unwrap_or(false) |
| 488 | } |
| 489 | |
| 490 | /// Return whether NFA compilation is configured to produce capture states. |
| 491 | #[deprecated (since = "0.3.5" , note = "use get_which_captures instead" )] |
| 492 | pub fn get_captures(&self) -> bool { |
| 493 | self.get_which_captures().is_any() |
| 494 | } |
| 495 | |
| 496 | /// Return what kinds of capture states will be compiled into an NFA. |
| 497 | pub fn get_which_captures(&self) -> WhichCaptures { |
| 498 | self.which_captures.unwrap_or(WhichCaptures::All) |
| 499 | } |
| 500 | |
| 501 | /// Return the look-around matcher for this NFA. |
| 502 | pub fn get_look_matcher(&self) -> LookMatcher { |
| 503 | self.look_matcher.clone().unwrap_or(LookMatcher::default()) |
| 504 | } |
| 505 | |
| 506 | /// Return whether NFA compilation is configured to include an unanchored |
| 507 | /// prefix. |
| 508 | /// |
| 509 | /// This is always false when not in test mode. |
| 510 | fn get_unanchored_prefix(&self) -> bool { |
| 511 | #[cfg (test)] |
| 512 | { |
| 513 | self.unanchored_prefix.unwrap_or(true) |
| 514 | } |
| 515 | #[cfg (not(test))] |
| 516 | { |
| 517 | true |
| 518 | } |
| 519 | } |
| 520 | |
| 521 | /// Overwrite the default configuration such that the options in `o` are |
| 522 | /// always used. If an option in `o` is not set, then the corresponding |
| 523 | /// option in `self` is used. If it's not set in `self` either, then it |
| 524 | /// remains not set. |
| 525 | pub(crate) fn overwrite(&self, o: Config) -> Config { |
| 526 | Config { |
| 527 | utf8: o.utf8.or(self.utf8), |
| 528 | reverse: o.reverse.or(self.reverse), |
| 529 | nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), |
| 530 | shrink: o.shrink.or(self.shrink), |
| 531 | which_captures: o.which_captures.or(self.which_captures), |
| 532 | look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), |
| 533 | #[cfg (test)] |
| 534 | unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), |
| 535 | } |
| 536 | } |
| 537 | } |
| 538 | |
/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
#[derive (Clone, Copy, Debug)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA). Note that reverse NFA compilation currently
    /// requires this option (see [`Config::reverse`]).
    None,
}
| 568 | |
| 569 | impl Default for WhichCaptures { |
| 570 | fn default() -> WhichCaptures { |
| 571 | WhichCaptures::All |
| 572 | } |
| 573 | } |
| 574 | |
| 575 | impl WhichCaptures { |
| 576 | /// Returns true if this configuration indicates that no capture states |
| 577 | /// should be produced in an NFA. |
| 578 | pub fn is_none(&self) -> bool { |
| 579 | matches!(*self, WhichCaptures::None) |
| 580 | } |
| 581 | |
| 582 | /// Returns true if this configuration indicates that some capture states |
| 583 | /// should be added to an NFA. Note that this might only include capture |
| 584 | /// states for implicit capture groups. |
| 585 | pub fn is_any(&self) -> bool { |
| 586 | !self.is_none() |
| 587 | } |
| 588 | } |
| 589 | |
| 590 | /* |
| 591 | This compiler below uses Thompson's construction algorithm. The compiler takes |
| 592 | a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph |
| 593 | is structured in a way that permits it to be executed by a virtual machine and |
| 594 | also used to efficiently build a DFA. |
| 595 | |
| 596 | The compiler deals with a slightly expanded set of NFA states than what is |
| 597 | in a final NFA (as exhibited by builder::State and nfa::State). Notably a |
| 598 | compiler state includes an empty node that has exactly one unconditional |
| 599 | epsilon transition to the next state. In other words, it's a "goto" instruction |
| 600 | if one views Thompson's NFA as a set of bytecode instructions. These goto |
| 601 | instructions are removed in a subsequent phase before returning the NFA to the |
| 602 | caller. The purpose of these empty nodes is that they make the construction |
| 603 | algorithm substantially simpler to implement. We remove them before returning |
| 604 | to the caller because they can represent substantial overhead when traversing |
| 605 | the NFA graph (either while searching using the NFA directly or while building |
| 606 | a DFA). |
| 607 | |
| 608 | In the future, it would be nice to provide a Glushkov compiler as well, as it |
| 609 | would work well as a bit-parallel NFA for smaller regexes. But the Thompson |
| 610 | construction is one I'm more familiar with and seems more straight-forward to |
| 611 | deal with when it comes to large Unicode character classes. |
| 612 | |
| 613 | Internally, the compiler uses interior mutability to improve composition in the |
| 614 | face of the borrow checker. In particular, we'd really like to be able to write |
| 615 | things like this: |
| 616 | |
| 617 | self.c_concat(exprs.iter().map(|e| self.c(e))) |
| 618 | |
| 619 | Which elegantly uses iterators to build up a sequence of compiled regex |
| 620 | sub-expressions and then hands it off to the concatenating compiler routine. |
| 621 | Without interior mutability, the borrow checker won't let us borrow `self` |
| 622 | mutably both inside and outside the closure at the same time. |
| 623 | */ |
| 624 | |
| 625 | /// A builder for compiling an NFA from a regex's high-level intermediate |
| 626 | /// representation (HIR). |
| 627 | /// |
| 628 | /// This compiler provides a way to translate a parsed regex pattern into an |
| 629 | /// NFA state graph. The NFA state graph can either be used directly to execute |
| 630 | /// a search (e.g., with a Pike VM), or it can be further used to build a DFA. |
| 631 | /// |
| 632 | /// This compiler provides APIs both for compiling regex patterns directly from |
| 633 | /// their concrete syntax, or via a [`regex_syntax::hir::Hir`]. |
| 634 | /// |
| 635 | /// This compiler has various options that may be configured via |
| 636 | /// [`thompson::Config`](Config). |
| 637 | /// |
| 638 | /// Note that a compiler is not the same as a [`thompson::Builder`](Builder). |
| 639 | /// A `Builder` provides a lower level API that is uncoupled from a regex |
| 640 | /// pattern's concrete syntax or even its HIR. Instead, it permits stitching |
| 641 | /// together an NFA by hand. See its docs for examples. |
| 642 | /// |
| 643 | /// # Example: compilation from concrete syntax |
| 644 | /// |
| 645 | /// This shows how to compile an NFA from a pattern string while setting a size |
| 646 | /// limit on how big the NFA is allowed to be (in terms of bytes of heap used). |
| 647 | /// |
| 648 | /// ``` |
| 649 | /// use regex_automata::{ |
| 650 | /// nfa::thompson::{NFA, pikevm::PikeVM}, |
| 651 | /// Match, |
| 652 | /// }; |
| 653 | /// |
| 654 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 655 | /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w" )?; |
| 656 | /// |
| 657 | /// let re = PikeVM::new_from_nfa(nfa)?; |
| 658 | /// let mut cache = re.create_cache(); |
| 659 | /// let mut caps = re.create_captures(); |
| 660 | /// let expected = Some(Match::must(0, 3..4)); |
| 661 | /// re.captures(&mut cache, "!@#A#@!" , &mut caps); |
| 662 | /// assert_eq!(expected, caps.get_match()); |
| 663 | /// |
| 664 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 665 | /// ``` |
| 666 | /// |
| 667 | /// # Example: compilation from HIR |
| 668 | /// |
| 669 | /// This shows how to hand assemble a regular expression via its HIR, and then |
| 670 | /// compile an NFA directly from it. |
| 671 | /// |
| 672 | /// ``` |
| 673 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
| 674 | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
| 675 | /// |
| 676 | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
| 677 | /// ClassBytesRange::new(b'0' , b'9' ), |
| 678 | /// ClassBytesRange::new(b'A' , b'Z' ), |
| 679 | /// ClassBytesRange::new(b'_' , b'_' ), |
| 680 | /// ClassBytesRange::new(b'a' , b'z' ), |
| 681 | /// ]))); |
| 682 | /// |
| 683 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 684 | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
| 685 | /// |
| 686 | /// let re = PikeVM::new_from_nfa(nfa)?; |
| 687 | /// let mut cache = re.create_cache(); |
| 688 | /// let mut caps = re.create_captures(); |
| 689 | /// let expected = Some(Match::must(0, 3..4)); |
| 690 | /// re.captures(&mut cache, "!@#A#@!" , &mut caps); |
| 691 | /// assert_eq!(expected, caps.get_match()); |
| 692 | /// |
| 693 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 694 | /// ``` |
#[derive (Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    ///
    /// This (and the scratch state below) is wrapped in a `RefCell` so that
    /// compilation routines can compose while all sharing `&self`.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}
| 715 | |
| 716 | impl Compiler { |
| 717 | /// Create a new NFA builder with its default configuration. |
| 718 | pub fn new() -> Compiler { |
| 719 | Compiler { |
| 720 | parser: ParserBuilder::new(), |
| 721 | config: Config::default(), |
| 722 | builder: RefCell::new(Builder::new()), |
| 723 | utf8_state: RefCell::new(Utf8State::new()), |
| 724 | trie_state: RefCell::new(RangeTrie::new()), |
| 725 | utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), |
| 726 | } |
| 727 | } |
| 728 | |
| 729 | /// Compile the given regular expression pattern into an NFA. |
| 730 | /// |
| 731 | /// If there was a problem parsing the regex, then that error is returned. |
| 732 | /// |
| 733 | /// Otherwise, if there was a problem building the NFA, then an error is |
| 734 | /// returned. The only error that can occur is if the compiled regex would |
| 735 | /// exceed the size limits configured on this builder, or if any part of |
| 736 | /// the NFA would exceed the integer representations used. (For example, |
| 737 | /// too many states might plausibly occur on a 16-bit target.) |
| 738 | /// |
| 739 | /// # Example |
| 740 | /// |
| 741 | /// ``` |
| 742 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
| 743 | /// |
| 744 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 745 | /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w" )?; |
| 746 | /// |
| 747 | /// let re = PikeVM::new_from_nfa(nfa)?; |
| 748 | /// let mut cache = re.create_cache(); |
| 749 | /// let mut caps = re.create_captures(); |
| 750 | /// let expected = Some(Match::must(0, 3..4)); |
| 751 | /// re.captures(&mut cache, "!@#A#@!" , &mut caps); |
| 752 | /// assert_eq!(expected, caps.get_match()); |
| 753 | /// |
| 754 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 755 | /// ``` |
| 756 | pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> { |
| 757 | self.build_many(&[pattern]) |
| 758 | } |
| 759 | |
| 760 | /// Compile the given regular expression patterns into a single NFA. |
| 761 | /// |
| 762 | /// When matches are returned, the pattern ID corresponds to the index of |
| 763 | /// the pattern in the slice given. |
| 764 | /// |
| 765 | /// # Example |
| 766 | /// |
| 767 | /// ``` |
| 768 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
| 769 | /// |
| 770 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 771 | /// let nfa = NFA::compiler().configure(config).build_many(&[ |
| 772 | /// r"(?-u)\s" , |
| 773 | /// r"(?-u)\w" , |
| 774 | /// ])?; |
| 775 | /// |
| 776 | /// let re = PikeVM::new_from_nfa(nfa)?; |
| 777 | /// let mut cache = re.create_cache(); |
| 778 | /// let mut caps = re.create_captures(); |
| 779 | /// let expected = Some(Match::must(1, 1..2)); |
| 780 | /// re.captures(&mut cache, "!A! !A!" , &mut caps); |
| 781 | /// assert_eq!(expected, caps.get_match()); |
| 782 | /// |
| 783 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 784 | /// ``` |
| 785 | pub fn build_many<P: AsRef<str>>( |
| 786 | &self, |
| 787 | patterns: &[P], |
| 788 | ) -> Result<NFA, BuildError> { |
| 789 | let mut hirs = vec![]; |
| 790 | for p in patterns { |
| 791 | hirs.push( |
| 792 | self.parser |
| 793 | .build() |
| 794 | .parse(p.as_ref()) |
| 795 | .map_err(BuildError::syntax)?, |
| 796 | ); |
| 797 | debug!("parsed: {:?}" , p.as_ref()); |
| 798 | } |
| 799 | self.build_many_from_hir(&hirs) |
| 800 | } |
| 801 | |
| 802 | /// Compile the given high level intermediate representation of a regular |
| 803 | /// expression into an NFA. |
| 804 | /// |
| 805 | /// If there was a problem building the NFA, then an error is returned. The |
| 806 | /// only error that can occur is if the compiled regex would exceed the |
| 807 | /// size limits configured on this builder, or if any part of the NFA would |
| 808 | /// exceed the integer representations used. (For example, too many states |
| 809 | /// might plausibly occur on a 16-bit target.) |
| 810 | /// |
| 811 | /// # Example |
| 812 | /// |
| 813 | /// ``` |
| 814 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
| 815 | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
| 816 | /// |
| 817 | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
| 818 | /// ClassBytesRange::new(b'0' , b'9' ), |
| 819 | /// ClassBytesRange::new(b'A' , b'Z' ), |
| 820 | /// ClassBytesRange::new(b'_' , b'_' ), |
| 821 | /// ClassBytesRange::new(b'a' , b'z' ), |
| 822 | /// ]))); |
| 823 | /// |
| 824 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 825 | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
| 826 | /// |
| 827 | /// let re = PikeVM::new_from_nfa(nfa)?; |
| 828 | /// let mut cache = re.create_cache(); |
| 829 | /// let mut caps = re.create_captures(); |
| 830 | /// let expected = Some(Match::must(0, 3..4)); |
| 831 | /// re.captures(&mut cache, "!@#A#@!" , &mut caps); |
| 832 | /// assert_eq!(expected, caps.get_match()); |
| 833 | /// |
| 834 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 835 | /// ``` |
| 836 | pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> { |
| 837 | self.build_many_from_hir(&[expr]) |
| 838 | } |
| 839 | |
    /// Compile the given high level intermediate representations of regular
    /// expressions into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hirs = &[
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'\t', b'\r'),
    ///         ClassBytesRange::new(b' ', b' '),
    ///     ]))),
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'0', b'9'),
    ///         ClassBytesRange::new(b'A', b'Z'),
    ///         ClassBytesRange::new(b'_', b'_'),
    ///         ClassBytesRange::new(b'a', b'z'),
    ///     ]))),
    /// ];
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many_from_hir<H: Borrow<Hir>>(
        &self,
        exprs: &[H],
    ) -> Result<NFA, BuildError> {
        // Parsing (if any) has already happened, so this just delegates to
        // the internal HIR-to-NFA compilation routine.
        self.compile(exprs)
    }
| 883 | |
| 884 | /// Apply the given NFA configuration options to this builder. |
| 885 | /// |
| 886 | /// # Example |
| 887 | /// |
| 888 | /// ``` |
| 889 | /// use regex_automata::nfa::thompson::NFA; |
| 890 | /// |
| 891 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 892 | /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w" )?; |
| 893 | /// assert_eq!(nfa.pattern_len(), 1); |
| 894 | /// |
| 895 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 896 | /// ``` |
| 897 | pub fn configure(&mut self, config: Config) -> &mut Compiler { |
| 898 | self.config = self.config.overwrite(config); |
| 899 | self |
| 900 | } |
| 901 | |
    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    ///
    /// This syntax configuration only applies when an NFA is built directly
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
    /// settings are ignored.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
    ///
    /// let syntax_config = syntax::Config::new().unicode(false);
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
    /// // If Unicode were enabled, the number of states would be much bigger.
    /// assert!(nfa.states().len() < 15);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Compiler {
        // The settings are applied eagerly to the parser builder, so they
        // only affect subsequent calls to 'build'/'build_many'.
        config.apply(&mut self.parser);
        self
    }
| 931 | } |
| 932 | |
| 933 | impl Compiler { |
    /// Compile the sequence of HIR expressions given. Pattern IDs are
    /// allocated starting from 0, in correspondence with the slice given.
    ///
    /// It is legal to provide an empty slice. In that case, the NFA returned
    /// has no patterns and will never match anything.
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        // Capture states are not supported when compiling in reverse, so
        // reject that combination up front.
        if self.config.get_reverse()
            && self.config.get_which_captures().is_any()
        {
            return Err(BuildError::unsupported_captures());
        }

        // Reset the shared builder and propagate the relevant configuration
        // knobs before compiling anything.
        self.builder.borrow_mut().clear();
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
        self.builder
            .borrow_mut()
            .set_look_matcher(self.config.get_look_matcher());
        self.builder
            .borrow_mut()
            .set_size_limit(self.config.get_nfa_size_limit())?;

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            let props = e.borrow().properties();
            // In reverse mode, "anchored" means anchored at the end.
            if self.config.get_reverse() {
                props.look_set_suffix().contains(hir::Look::End)
            } else {
                props.look_set_prefix().contains(hir::Look::Start)
            }
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // Equivalent to '(?s-u:.)*?', i.e., lazily skip any bytes before
            // attempting a match at each position.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Each pattern is wrapped in the implicit capture group (index 0) and
        // terminated by its own match state. Pattern IDs are allocated in
        // slice order via start_pattern/finish_pattern.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }
| 995 | |
| 996 | /// Compile an arbitrary HIR expression. |
| 997 | fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> { |
| 998 | use regex_syntax::hir::{Class, HirKind::*}; |
| 999 | |
| 1000 | match *expr.kind() { |
| 1001 | Empty => self.c_empty(), |
| 1002 | Literal(hir::Literal(ref bytes)) => self.c_literal(bytes), |
| 1003 | Class(Class::Bytes(ref c)) => self.c_byte_class(c), |
| 1004 | Class(Class::Unicode(ref c)) => self.c_unicode_class(c), |
| 1005 | Look(ref look) => self.c_look(look), |
| 1006 | Repetition(ref rep) => self.c_repetition(rep), |
| 1007 | Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), |
| 1008 | Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), |
| 1009 | Alternation(ref es) => self.c_alt_slice(es), |
| 1010 | } |
| 1011 | } |
| 1012 | |
| 1013 | /// Compile a concatenation of the sub-expressions yielded by the given |
| 1014 | /// iterator. If the iterator yields no elements, then this compiles down |
| 1015 | /// to an "empty" state that always matches. |
| 1016 | /// |
| 1017 | /// If the compiler is in reverse mode, then the expressions given are |
| 1018 | /// automatically compiled in reverse. |
| 1019 | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> |
| 1020 | where |
| 1021 | I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, |
| 1022 | { |
| 1023 | let first = if self.is_reverse() { it.next_back() } else { it.next() }; |
| 1024 | let ThompsonRef { start, mut end } = match first { |
| 1025 | Some(result) => result?, |
| 1026 | None => return self.c_empty(), |
| 1027 | }; |
| 1028 | loop { |
| 1029 | let next = |
| 1030 | if self.is_reverse() { it.next_back() } else { it.next() }; |
| 1031 | let compiled = match next { |
| 1032 | Some(result) => result?, |
| 1033 | None => break, |
| 1034 | }; |
| 1035 | self.patch(end, compiled.start)?; |
| 1036 | end = compiled.end; |
| 1037 | } |
| 1038 | Ok(ThompsonRef { start, end }) |
| 1039 | } |
| 1040 | |
| 1041 | /// Compile an alternation of the given HIR values. |
| 1042 | /// |
| 1043 | /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead |
| 1044 | /// of an iterator of compiled NFA subgraphs. The point of accepting a |
| 1045 | /// slice here is that it opens up some optimization opportunities. For |
| 1046 | /// example, if all of the HIR values are literals, then this routine might |
| 1047 | /// re-shuffle them to make NFA epsilon closures substantially faster. |
| 1048 | fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> { |
| 1049 | // self.c_alt_iter(exprs.iter().map(|e| self.c(e))) |
| 1050 | let literal_count = exprs |
| 1051 | .iter() |
| 1052 | .filter(|e| { |
| 1053 | matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_))) |
| 1054 | }) |
| 1055 | .count(); |
| 1056 | if literal_count <= 1 || literal_count < exprs.len() { |
| 1057 | return self.c_alt_iter(exprs.iter().map(|e| self.c(e))); |
| 1058 | } |
| 1059 | |
| 1060 | let mut trie = if self.is_reverse() { |
| 1061 | LiteralTrie::reverse() |
| 1062 | } else { |
| 1063 | LiteralTrie::forward() |
| 1064 | }; |
| 1065 | for expr in exprs.iter() { |
| 1066 | let literal = match *expr.kind() { |
| 1067 | hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes, |
| 1068 | _ => unreachable!(), |
| 1069 | }; |
| 1070 | trie.add(literal)?; |
| 1071 | } |
| 1072 | trie.compile(&mut self.builder.borrow_mut()) |
| 1073 | } |
| 1074 | |
| 1075 | /// Compile an alternation, where each element yielded by the given |
| 1076 | /// iterator represents an item in the alternation. If the iterator yields |
| 1077 | /// no elements, then this compiles down to a "fail" state. |
| 1078 | /// |
| 1079 | /// In an alternation, expressions appearing earlier are "preferred" at |
| 1080 | /// match time over expressions appearing later. At least, this is true |
| 1081 | /// when using "leftmost first" match semantics. (If "leftmost longest" are |
| 1082 | /// ever added in the future, then this preference order of priority would |
| 1083 | /// not apply in that mode.) |
| 1084 | fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> |
| 1085 | where |
| 1086 | I: Iterator<Item = Result<ThompsonRef, BuildError>>, |
| 1087 | { |
| 1088 | let first = match it.next() { |
| 1089 | None => return self.c_fail(), |
| 1090 | Some(result) => result?, |
| 1091 | }; |
| 1092 | let second = match it.next() { |
| 1093 | None => return Ok(first), |
| 1094 | Some(result) => result?, |
| 1095 | }; |
| 1096 | |
| 1097 | let union = self.add_union()?; |
| 1098 | let end = self.add_empty()?; |
| 1099 | self.patch(union, first.start)?; |
| 1100 | self.patch(first.end, end)?; |
| 1101 | self.patch(union, second.start)?; |
| 1102 | self.patch(second.end, end)?; |
| 1103 | for result in it { |
| 1104 | let compiled = result?; |
| 1105 | self.patch(union, compiled.start)?; |
| 1106 | self.patch(compiled.end, end)?; |
| 1107 | } |
| 1108 | Ok(ThompsonRef { start: union, end }) |
| 1109 | } |
| 1110 | |
| 1111 | /// Compile the given capture sub-expression. `expr` should be the |
| 1112 | /// sub-expression contained inside the capture. If "capture" states are |
| 1113 | /// enabled, then they are added as appropriate. |
| 1114 | /// |
| 1115 | /// This accepts the pieces of a capture instead of a `hir::Capture` so |
| 1116 | /// that it's easy to manufacture a "fake" group when necessary, e.g., for |
| 1117 | /// adding the entire pattern as if it were a group in order to create |
| 1118 | /// appropriate "capture" states in the NFA. |
| 1119 | fn c_cap( |
| 1120 | &self, |
| 1121 | index: u32, |
| 1122 | name: Option<&str>, |
| 1123 | expr: &Hir, |
| 1124 | ) -> Result<ThompsonRef, BuildError> { |
| 1125 | match self.config.get_which_captures() { |
| 1126 | // No capture states means we always skip them. |
| 1127 | WhichCaptures::None => return self.c(expr), |
| 1128 | // Implicit captures states means we only add when index==0 since |
| 1129 | // index==0 implies the group is implicit. |
| 1130 | WhichCaptures::Implicit if index > 0 => return self.c(expr), |
| 1131 | _ => {} |
| 1132 | } |
| 1133 | |
| 1134 | let start = self.add_capture_start(index, name)?; |
| 1135 | let inner = self.c(expr)?; |
| 1136 | let end = self.add_capture_end(index)?; |
| 1137 | self.patch(start, inner.start)?; |
| 1138 | self.patch(inner.end, end)?; |
| 1139 | Ok(ThompsonRef { start, end }) |
| 1140 | } |
| 1141 | |
| 1142 | /// Compile the given repetition expression. This handles all types of |
| 1143 | /// repetitions and greediness. |
| 1144 | fn c_repetition( |
| 1145 | &self, |
| 1146 | rep: &hir::Repetition, |
| 1147 | ) -> Result<ThompsonRef, BuildError> { |
| 1148 | match (rep.min, rep.max) { |
| 1149 | (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), |
| 1150 | (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), |
| 1151 | (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), |
| 1152 | (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), |
| 1153 | } |
| 1154 | } |
| 1155 | |
| 1156 | /// Compile the given expression such that it matches at least `min` times, |
| 1157 | /// but no more than `max` times. |
| 1158 | /// |
| 1159 | /// When `greedy` is true, then the preference is for the expression to |
| 1160 | /// match as much as possible. Otherwise, it will match as little as |
| 1161 | /// possible. |
| 1162 | fn c_bounded( |
| 1163 | &self, |
| 1164 | expr: &Hir, |
| 1165 | greedy: bool, |
| 1166 | min: u32, |
| 1167 | max: u32, |
| 1168 | ) -> Result<ThompsonRef, BuildError> { |
| 1169 | let prefix = self.c_exactly(expr, min)?; |
| 1170 | if min == max { |
| 1171 | return Ok(prefix); |
| 1172 | } |
| 1173 | |
| 1174 | // It is tempting here to compile the rest here as a concatenation |
| 1175 | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
| 1176 | // were `aaa?a?a?`. The problem here is that it leads to this program: |
| 1177 | // |
| 1178 | // >000000: 61 => 01 |
| 1179 | // 000001: 61 => 02 |
| 1180 | // 000002: union(03, 04) |
| 1181 | // 000003: 61 => 04 |
| 1182 | // 000004: union(05, 06) |
| 1183 | // 000005: 61 => 06 |
| 1184 | // 000006: union(07, 08) |
| 1185 | // 000007: 61 => 08 |
| 1186 | // 000008: MATCH |
| 1187 | // |
| 1188 | // And effectively, once you hit state 2, the epsilon closure will |
| 1189 | // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better |
| 1190 | // to instead compile it like so: |
| 1191 | // |
| 1192 | // >000000: 61 => 01 |
| 1193 | // 000001: 61 => 02 |
| 1194 | // 000002: union(03, 08) |
| 1195 | // 000003: 61 => 04 |
| 1196 | // 000004: union(05, 08) |
| 1197 | // 000005: 61 => 06 |
| 1198 | // 000006: union(07, 08) |
| 1199 | // 000007: 61 => 08 |
| 1200 | // 000008: MATCH |
| 1201 | // |
| 1202 | // So that the epsilon closure of state 2 is now just 3 and 8. |
| 1203 | let empty = self.add_empty()?; |
| 1204 | let mut prev_end = prefix.end; |
| 1205 | for _ in min..max { |
| 1206 | let union = if greedy { |
| 1207 | self.add_union() |
| 1208 | } else { |
| 1209 | self.add_union_reverse() |
| 1210 | }?; |
| 1211 | let compiled = self.c(expr)?; |
| 1212 | self.patch(prev_end, union)?; |
| 1213 | self.patch(union, compiled.start)?; |
| 1214 | self.patch(union, empty)?; |
| 1215 | prev_end = compiled.end; |
| 1216 | } |
| 1217 | self.patch(prev_end, empty)?; |
| 1218 | Ok(ThompsonRef { start: prefix.start, end: empty }) |
| 1219 | } |
| 1220 | |
    /// Compile the given expression such that it may be matched `n` or more
    /// times, where `n` can be any integer. (Although a particularly large
    /// integer is likely to run afoul of any configured size limits.)
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_at_least(
        &self,
        expr: &Hir,
        greedy: bool,
        n: u32,
    ) -> Result<ThompsonRef, BuildError> {
        if n == 0 {
            // When the expression cannot match the empty string, then we
            // can get away with something much simpler: just one 'alt'
            // instruction that optionally repeats itself. But if the expr
            // can match the empty string... see below.
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
                let union = if greedy {
                    self.add_union()
                } else {
                    self.add_union_reverse()
                }?;
                let compiled = self.c(expr)?;
                self.patch(union, compiled.start)?;
                self.patch(compiled.end, union)?;
                // The union doubles as both entry and exit: it either enters
                // the sub-expression (again) or falls through.
                return Ok(ThompsonRef { start: union, end: union });
            }

            // What's going on here? Shouldn't x* be simpler than this? It
            // turns out that when implementing leftmost-first (Perl-like)
            // match semantics, x* results in an incorrect preference order
            // when computing the transitive closure of states if and only if
            // 'x' can match the empty string. So instead, we compile x* as
            // (x+)?, which preserves the correct preference order.
            //
            // See: https://github.com/rust-lang/regex/issues/779
            let compiled = self.c(expr)?;
            let plus = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, plus)?;
            self.patch(plus, compiled.start)?;

            let question = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let empty = self.add_empty()?;
            self.patch(question, compiled.start)?;
            self.patch(question, empty)?;
            self.patch(plus, empty)?;
            Ok(ThompsonRef { start: question, end: empty })
        } else if n == 1 {
            // 'x+' is just 'x' followed by a union that loops back to the
            // start of 'x' (or falls through, depending on greediness).
            let compiled = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, union)?;
            self.patch(union, compiled.start)?;
            Ok(ThompsonRef { start: compiled.start, end: union })
        } else {
            // 'x{n,}' is 'x{n-1}' followed by 'x+': only the final copy of
            // 'x' repeats.
            let prefix = self.c_exactly(expr, n - 1)?;
            let last = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(prefix.end, last.start)?;
            self.patch(last.end, union)?;
            self.patch(union, last.start)?;
            Ok(ThompsonRef { start: prefix.start, end: union })
        }
    }
| 1302 | |
| 1303 | /// Compile the given expression such that it may be matched zero or one |
| 1304 | /// times. |
| 1305 | /// |
| 1306 | /// When `greedy` is true, then the preference is for the expression to |
| 1307 | /// match as much as possible. Otherwise, it will match as little as |
| 1308 | /// possible. |
| 1309 | fn c_zero_or_one( |
| 1310 | &self, |
| 1311 | expr: &Hir, |
| 1312 | greedy: bool, |
| 1313 | ) -> Result<ThompsonRef, BuildError> { |
| 1314 | let union = |
| 1315 | if greedy { self.add_union() } else { self.add_union_reverse() }?; |
| 1316 | let compiled = self.c(expr)?; |
| 1317 | let empty = self.add_empty()?; |
| 1318 | self.patch(union, compiled.start)?; |
| 1319 | self.patch(union, empty)?; |
| 1320 | self.patch(compiled.end, empty)?; |
| 1321 | Ok(ThompsonRef { start: union, end: empty }) |
| 1322 | } |
| 1323 | |
| 1324 | /// Compile the given HIR expression exactly `n` times. |
| 1325 | fn c_exactly( |
| 1326 | &self, |
| 1327 | expr: &Hir, |
| 1328 | n: u32, |
| 1329 | ) -> Result<ThompsonRef, BuildError> { |
| 1330 | let it = (0..n).map(|_| self.c(expr)); |
| 1331 | self.c_concat(it) |
| 1332 | } |
| 1333 | |
| 1334 | /// Compile the given byte oriented character class. |
| 1335 | /// |
| 1336 | /// This uses "sparse" states to represent an alternation between ranges in |
| 1337 | /// this character class. We can use "sparse" states instead of stitching |
| 1338 | /// together a "union" state because all ranges in a character class have |
| 1339 | /// equal priority *and* are non-overlapping (thus, only one can match, so |
| 1340 | /// there's never a question of priority in the first place). This saves a |
| 1341 | /// fair bit of overhead when traversing an NFA. |
| 1342 | /// |
| 1343 | /// This routine compiles an empty character class into a "fail" state. |
| 1344 | fn c_byte_class( |
| 1345 | &self, |
| 1346 | cls: &hir::ClassBytes, |
| 1347 | ) -> Result<ThompsonRef, BuildError> { |
| 1348 | let end = self.add_empty()?; |
| 1349 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
| 1350 | for r in cls.iter() { |
| 1351 | trans.push(Transition { |
| 1352 | start: r.start(), |
| 1353 | end: r.end(), |
| 1354 | next: end, |
| 1355 | }); |
| 1356 | } |
| 1357 | Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) |
| 1358 | } |
| 1359 | |
| 1360 | /// Compile the given Unicode character class. |
| 1361 | /// |
| 1362 | /// This routine specifically tries to use various types of compression, |
| 1363 | /// since UTF-8 automata of large classes can get quite large. The specific |
| 1364 | /// type of compression used depends on forward vs reverse compilation, and |
| 1365 | /// whether NFA shrinking is enabled or not. |
| 1366 | /// |
| 1367 | /// Aside from repetitions causing lots of repeat group, this is like the |
| 1368 | /// single most expensive part of regex compilation. Therefore, a large part |
| 1369 | /// of the expense of compilation may be reduce by disabling Unicode in the |
| 1370 | /// pattern. |
| 1371 | /// |
| 1372 | /// This routine compiles an empty character class into a "fail" state. |
| 1373 | fn c_unicode_class( |
| 1374 | &self, |
| 1375 | cls: &hir::ClassUnicode, |
| 1376 | ) -> Result<ThompsonRef, BuildError> { |
| 1377 | // If all we have are ASCII ranges wrapped in a Unicode package, then |
| 1378 | // there is zero reason to bring out the big guns. We can fit all ASCII |
| 1379 | // ranges within a single sparse state. |
| 1380 | if cls.is_ascii() { |
| 1381 | let end = self.add_empty()?; |
| 1382 | let mut trans = Vec::with_capacity(cls.ranges().len()); |
| 1383 | for r in cls.iter() { |
| 1384 | // The unwraps below are OK because we've verified that this |
| 1385 | // class only contains ASCII codepoints. |
| 1386 | trans.push(Transition { |
| 1387 | // FIXME(1.59): use the 'TryFrom<char> for u8' impl. |
| 1388 | start: u8::try_from(u32::from(r.start())).unwrap(), |
| 1389 | end: u8::try_from(u32::from(r.end())).unwrap(), |
| 1390 | next: end, |
| 1391 | }); |
| 1392 | } |
| 1393 | Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) |
| 1394 | } else if self.is_reverse() { |
| 1395 | if !self.config.get_shrink() { |
| 1396 | // When we don't want to spend the extra time shrinking, we |
| 1397 | // compile the UTF-8 automaton in reverse using something like |
| 1398 | // the "naive" approach, but will attempt to re-use common |
| 1399 | // suffixes. |
| 1400 | self.c_unicode_class_reverse_with_suffix(cls) |
| 1401 | } else { |
| 1402 | // When we want to shrink our NFA for reverse UTF-8 automata, |
| 1403 | // we cannot feed UTF-8 sequences directly to the UTF-8 |
| 1404 | // compiler, since the UTF-8 compiler requires all sequences |
| 1405 | // to be lexicographically sorted. Instead, we organize our |
| 1406 | // sequences into a range trie, which can then output our |
| 1407 | // sequences in the correct order. Unfortunately, building the |
| 1408 | // range trie is fairly expensive (but not nearly as expensive |
| 1409 | // as building a DFA). Hence the reason why the 'shrink' option |
| 1410 | // exists, so that this path can be toggled off. For example, |
| 1411 | // we might want to turn this off if we know we won't be |
| 1412 | // compiling a DFA. |
| 1413 | let mut trie = self.trie_state.borrow_mut(); |
| 1414 | trie.clear(); |
| 1415 | |
| 1416 | for rng in cls.iter() { |
| 1417 | for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { |
| 1418 | seq.reverse(); |
| 1419 | trie.insert(seq.as_slice()); |
| 1420 | } |
| 1421 | } |
| 1422 | let mut builder = self.builder.borrow_mut(); |
| 1423 | let mut utf8_state = self.utf8_state.borrow_mut(); |
| 1424 | let mut utf8c = |
| 1425 | Utf8Compiler::new(&mut *builder, &mut *utf8_state)?; |
| 1426 | trie.iter(|seq| { |
| 1427 | utf8c.add(&seq)?; |
| 1428 | Ok(()) |
| 1429 | })?; |
| 1430 | utf8c.finish() |
| 1431 | } |
| 1432 | } else { |
| 1433 | // In the forward direction, we always shrink our UTF-8 automata |
| 1434 | // because we can stream it right into the UTF-8 compiler. There |
| 1435 | // is almost no downside (in either memory or time) to using this |
| 1436 | // approach. |
| 1437 | let mut builder = self.builder.borrow_mut(); |
| 1438 | let mut utf8_state = self.utf8_state.borrow_mut(); |
| 1439 | let mut utf8c = |
| 1440 | Utf8Compiler::new(&mut *builder, &mut *utf8_state)?; |
| 1441 | for rng in cls.iter() { |
| 1442 | for seq in Utf8Sequences::new(rng.start(), rng.end()) { |
| 1443 | utf8c.add(seq.as_slice())?; |
| 1444 | } |
| 1445 | } |
| 1446 | utf8c.finish() |
| 1447 | } |
| 1448 | |
| 1449 | // For reference, the code below is the "naive" version of compiling a |
| 1450 | // UTF-8 automaton. It is deliciously simple (and works for both the |
| 1451 | // forward and reverse cases), but will unfortunately produce very |
| 1452 | // large NFAs. When compiling a forward automaton, the size difference |
| 1453 | // can sometimes be an order of magnitude. For example, the '\w' regex |
| 1454 | // will generate about ~3000 NFA states using the naive approach below, |
| 1455 | // but only 283 states when using the approach above. This is because |
| 1456 | // the approach above actually compiles a *minimal* (or near minimal, |
| 1457 | // because of the bounded hashmap for reusing equivalent states) UTF-8 |
| 1458 | // automaton. |
| 1459 | // |
| 1460 | // The code below is kept as a reference point in order to make it |
| 1461 | // easier to understand the higher level goal here. Although, it will |
| 1462 | // almost certainly bit-rot, so keep that in mind. Also, if you try to |
| 1463 | // use it, some of the tests in this module will fail because they look |
        // for terser byte code produced by the more optimized handling above.
| 1465 | // But the integration test suite should still pass. |
| 1466 | // |
| 1467 | // One good example of the substantial difference this can make is to |
| 1468 | // compare and contrast performance of the Pike VM when the code below |
| 1469 | // is active vs the code above. Here's an example to try: |
| 1470 | // |
| 1471 | // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file |
| 1472 | // |
| 1473 | // With Unicode classes generated below, this search takes about 45s on |
| 1474 | // my machine. But with the compressed version above, the search takes |
| 1475 | // only around 1.4s. The NFA is also 20% smaller. This is in part due |
| 1476 | // to the compression, but also because of the utilization of 'sparse' |
| 1477 | // NFA states. They lead to much less state shuffling during the NFA |
| 1478 | // search. |
| 1479 | /* |
| 1480 | let it = cls |
| 1481 | .iter() |
| 1482 | .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) |
| 1483 | .map(|seq| { |
| 1484 | let it = seq |
| 1485 | .as_slice() |
| 1486 | .iter() |
| 1487 | .map(|rng| self.c_range(rng.start, rng.end)); |
| 1488 | self.c_concat(it) |
| 1489 | }); |
| 1490 | self.c_alt_iter(it) |
| 1491 | */ |
| 1492 | } |
| 1493 | |
    /// Compile the given Unicode character class in reverse with suffix
    /// caching.
    ///
    /// This is a "quick" way to compile large Unicode classes into reverse
    /// UTF-8 automata while doing a small amount of compression on that
    /// automata by reusing common suffixes.
    ///
    /// A more comprehensive compression scheme can be accomplished by using
    /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
    /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
    ///
    /// This is the technique used when "NFA shrinking" is disabled.
    ///
    /// (This also tries to use "sparse" states where possible, just like
    /// `c_byte_class` does.)
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
        let mut cache = self.utf8_suffix.borrow_mut();
        cache.clear();

        let union = self.add_union()?;
        let alt_end = self.add_empty()?;
        for urng in cls.iter() {
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
                // Build this sequence by chaining each byte range in front
                // of `end`, starting from the shared end state. Identical
                // (range, next-state) suffixes hit the cache below.
                let mut end = alt_end;
                for brng in seq.as_slice() {
                    let key = Utf8SuffixKey {
                        from: end,
                        start: brng.start,
                        end: brng.end,
                    };
                    let hash = cache.hash(&key);
                    if let Some(id) = cache.get(&key, hash) {
                        // Reuse a previously compiled suffix state.
                        end = id;
                        continue;
                    }

                    let compiled = self.c_range(brng.start, brng.end)?;
                    self.patch(compiled.end, end)?;
                    end = compiled.start;
                    cache.set(key, hash, end);
                }
                // `end` is now the entry point for this whole sequence; add
                // it as one alternate of the overall union.
                self.patch(union, end)?;
            }
        }
        Ok(ThompsonRef { start: union, end: alt_end })
    }
| 1547 | |
    /// Compile the given HIR look-around assertion to an NFA look-around
    /// assertion.
    ///
    /// This is a straight one-to-one mapping from the HIR's `Look` to this
    /// crate's `Look`. Note that any flipping of assertions for reverse
    /// compilation happens inside `add_look`, not here.
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
        let look = match *anchor {
            hir::Look::Start => Look::Start,
            hir::Look::End => Look::End,
            hir::Look::StartLF => Look::StartLF,
            hir::Look::EndLF => Look::EndLF,
            hir::Look::StartCRLF => Look::StartCRLF,
            hir::Look::EndCRLF => Look::EndCRLF,
            hir::Look::WordAscii => Look::WordAscii,
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
            hir::Look::WordUnicode => Look::WordUnicode,
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            hir::Look::WordStartAscii => Look::WordStartAscii,
            hir::Look::WordEndAscii => Look::WordEndAscii,
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
        };
        // A look-around state is a single state, so start == end.
        let id = self.add_look(look)?;
        Ok(ThompsonRef { start: id, end: id })
    }
| 1574 | |
| 1575 | /// Compile the given byte string to a concatenation of bytes. |
| 1576 | fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> { |
| 1577 | self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b))) |
| 1578 | } |
| 1579 | |
| 1580 | /// Compile a "range" state with one transition that may only be followed |
| 1581 | /// if the input byte is in the (inclusive) range given. |
| 1582 | /// |
| 1583 | /// Both the `start` and `end` locations point to the state created. |
| 1584 | /// Callers will likely want to keep the `start`, but patch the `end` to |
| 1585 | /// point to some other state. |
| 1586 | fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> { |
| 1587 | let id = self.add_range(start, end)?; |
| 1588 | Ok(ThompsonRef { start: id, end: id }) |
| 1589 | } |
| 1590 | |
| 1591 | /// Compile an "empty" state with one unconditional epsilon transition. |
| 1592 | /// |
| 1593 | /// Both the `start` and `end` locations point to the state created. |
| 1594 | /// Callers will likely want to keep the `start`, but patch the `end` to |
| 1595 | /// point to some other state. |
| 1596 | fn c_empty(&self) -> Result<ThompsonRef, BuildError> { |
| 1597 | let id = self.add_empty()?; |
| 1598 | Ok(ThompsonRef { start: id, end: id }) |
| 1599 | } |
| 1600 | |
| 1601 | /// Compile a "fail" state that can never have any outgoing transitions. |
| 1602 | fn c_fail(&self) -> Result<ThompsonRef, BuildError> { |
| 1603 | let id = self.add_fail()?; |
| 1604 | Ok(ThompsonRef { start: id, end: id }) |
| 1605 | } |
| 1606 | |
| 1607 | // The below helpers are meant to be simple wrappers around the |
| 1608 | // corresponding Builder methods. For the most part, they let us write |
| 1609 | // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where |
| 1610 | // the latter is a mouthful. Some of the methods do inject a little bit |
| 1611 | // of extra logic. e.g., Flipping look-around operators when compiling in |
| 1612 | // reverse mode. |
| 1613 | |
    /// Add an unconditional epsilon transition from `from` to `to` by
    /// delegating to the underlying builder.
    fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
        self.builder.borrow_mut().patch(from, to)
    }
| 1617 | |
    /// Mark the beginning of a new pattern in the underlying builder.
    fn start_pattern(&self) -> Result<PatternID, BuildError> {
        self.builder.borrow_mut().start_pattern()
    }
| 1621 | |
    /// Mark the end of the current pattern, whose compiled sub-graph begins
    /// at `start_id`.
    fn finish_pattern(
        &self,
        start_id: StateID,
    ) -> Result<PatternID, BuildError> {
        self.builder.borrow_mut().finish_pattern(start_id)
    }
| 1628 | |
    /// Add an "empty" state (a single unconditional epsilon transition).
    fn add_empty(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_empty()
    }
| 1632 | |
| 1633 | fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> { |
| 1634 | self.builder.borrow_mut().add_range(Transition { |
| 1635 | start, |
| 1636 | end, |
| 1637 | next: StateID::ZERO, |
| 1638 | }) |
| 1639 | } |
| 1640 | |
    /// Add a "sparse" state with the given set of transitions.
    fn add_sparse(
        &self,
        ranges: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_sparse(ranges)
    }
| 1647 | |
| 1648 | fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> { |
| 1649 | if self.is_reverse() { |
| 1650 | look = look.reversed(); |
| 1651 | } |
| 1652 | self.builder.borrow_mut().add_look(StateID::ZERO, look) |
| 1653 | } |
| 1654 | |
    /// Add a union state with no alternates yet; alternates are added by
    /// patching.
    fn add_union(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_union(vec![])
    }
| 1658 | |
    /// Add a "reverse" union state (alternates preferred in reverse order),
    /// with no alternates yet.
    fn add_union_reverse(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_union_reverse(vec![])
    }
| 1662 | |
| 1663 | fn add_capture_start( |
| 1664 | &self, |
| 1665 | capture_index: u32, |
| 1666 | name: Option<&str>, |
| 1667 | ) -> Result<StateID, BuildError> { |
| 1668 | let name = name.map(|n| Arc::from(n)); |
| 1669 | self.builder.borrow_mut().add_capture_start( |
| 1670 | StateID::ZERO, |
| 1671 | capture_index, |
| 1672 | name, |
| 1673 | ) |
| 1674 | } |
| 1675 | |
    /// Add an "end capture" state for the given capture group index.
    fn add_capture_end(
        &self,
        capture_index: u32,
    ) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
    }
| 1682 | |
    /// Add a "fail" state (no outgoing transitions).
    fn add_fail(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_fail()
    }
| 1686 | |
    /// Add a match state for the current pattern.
    fn add_match(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_match()
    }
| 1690 | |
    /// Whether this compiler is building a reverse NFA, per its config.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
| 1694 | } |
| 1695 | |
/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial state of this sub-graph.
    pub(crate) start: StateID,
    /// The final state of this sub-graph.
    pub(crate) end: StateID,
}
| 1704 | |
/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
/// minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Scratch space (state cache and uncompiled-node stack), cleared and
    /// reused across compilations.
    state: &'a mut Utf8State,
    /// The shared final state that every compiled sequence leads to.
    target: StateID,
}
| 1731 | |
/// Reusable scratch state for `Utf8Compiler`.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache mapping a state's transitions to a previously added
    /// NFA state, used to reuse equivalent states.
    compiled: Utf8BoundedMap,
    /// A stack of nodes that have not yet been committed to the NFA.
    uncompiled: Vec<Utf8Node>,
}
| 1737 | |
/// A node on the uncompiled stack: transitions whose destinations are fixed,
/// plus at most one pending transition whose destination is not yet known.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions whose destination state has been determined.
    trans: Vec<Transition>,
    /// The most recently added byte range, awaiting a destination.
    last: Option<Utf8LastTransition>,
}
| 1743 | |
/// A byte range (inclusive) whose destination state has not yet been
/// determined.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    start: u8,
    end: u8,
}
| 1749 | |
| 1750 | impl Utf8State { |
| 1751 | fn new() -> Utf8State { |
| 1752 | Utf8State { compiled: Utf8BoundedMap::new(capacity:10_000), uncompiled: vec![] } |
| 1753 | } |
| 1754 | |
| 1755 | fn clear(&mut self) { |
| 1756 | self.compiled.clear(); |
| 1757 | self.uncompiled.clear(); |
| 1758 | } |
| 1759 | } |
| 1760 | |
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that writes states into `builder`, using
    /// `state` as scratch space (which is cleared here). A single empty
    /// "target" state is created up front; every compiled sequence
    /// ultimately leads to it.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        // Seed the stack with the root uncompiled node.
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Flush all remaining uncompiled nodes and return the sub-graph for
    /// the entire automaton built so far.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add the next UTF-8 byte-range sequence to the automaton. Sequences
    /// must arrive in lexicographic order (see the type-level docs).
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading ranges are shared with the pending path on
        // the stack; that shared prefix can remain uncompiled.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        // A new sequence must diverge from the previous one somewhere.
        assert!(prefix_len < ranges.len());
        // Everything past the shared prefix can never change again, so
        // compile it down, then push the new suffix onto the stack.
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile all stack nodes above index `from`, linking each popped node
    /// to the one compiled after it, and finally point the pending
    /// transition of the node at `from` at the result.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Turn a frozen node's transitions into an NFA state, reusing an
    /// equivalent previously-added state when the bounded cache has one.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given (non-empty) suffix of byte ranges onto the stack as
    /// pending transitions, one stack node per range.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        // The first range becomes the pending transition of the current top
        // node, which must not already have one.
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        // Every remaining range gets a fresh node of its own.
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions onto the stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top node, freezing its pending transition to point at
    /// `next`, and return its completed transition list.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. The stack must contain exactly the root, and the
    /// root must have no pending transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Freeze the top node's pending transition to point at `next`, without
    /// popping the node.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
| 1866 | |
| 1867 | impl Utf8Node { |
| 1868 | fn set_last_transition(&mut self, next: StateID) { |
| 1869 | if let Some(last: Utf8LastTransition) = self.last.take() { |
| 1870 | self.trans.push(Transition { |
| 1871 | start: last.start, |
| 1872 | end: last.end, |
| 1873 | next, |
| 1874 | }); |
| 1875 | } |
| 1876 | } |
| 1877 | } |
| 1878 | |
| 1879 | #[cfg (test)] |
| 1880 | mod tests { |
| 1881 | use alloc::vec; |
| 1882 | |
| 1883 | use crate::{ |
| 1884 | nfa::thompson::{SparseTransitions, State}, |
| 1885 | util::primitives::SmallIndex, |
| 1886 | }; |
| 1887 | |
| 1888 | use super::*; |
| 1889 | |
| 1890 | fn build(pattern: &str) -> NFA { |
| 1891 | NFA::compiler() |
| 1892 | .configure( |
| 1893 | NFA::config() |
| 1894 | .which_captures(WhichCaptures::None) |
| 1895 | .unanchored_prefix(false), |
| 1896 | ) |
| 1897 | .build(pattern) |
| 1898 | .unwrap() |
| 1899 | } |
| 1900 | |
    /// Shorthand for constructing a `PatternID`; panics if out of range.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }
| 1904 | |
    /// Shorthand for constructing a `StateID`; panics if out of range.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }
| 1908 | |
| 1909 | fn s_byte(byte: u8, next: usize) -> State { |
| 1910 | let next = sid(next); |
| 1911 | let trans = Transition { start: byte, end: byte, next }; |
| 1912 | State::ByteRange { trans } |
| 1913 | } |
| 1914 | |
| 1915 | fn s_range(start: u8, end: u8, next: usize) -> State { |
| 1916 | let next = sid(next); |
| 1917 | let trans = Transition { start, end, next }; |
| 1918 | State::ByteRange { trans } |
| 1919 | } |
| 1920 | |
| 1921 | fn s_sparse(transitions: &[(u8, u8, usize)]) -> State { |
| 1922 | let transitions = transitions |
| 1923 | .iter() |
| 1924 | .map(|&(start, end, next)| Transition { |
| 1925 | start, |
| 1926 | end, |
| 1927 | next: sid(next), |
| 1928 | }) |
| 1929 | .collect(); |
| 1930 | State::Sparse(SparseTransitions { transitions }) |
| 1931 | } |
| 1932 | |
| 1933 | fn s_look(look: Look, next: usize) -> State { |
| 1934 | let next = sid(next); |
| 1935 | State::Look { look, next } |
| 1936 | } |
| 1937 | |
    /// Construct a binary union (epsilon) state with two alternates.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }
| 1941 | |
| 1942 | fn s_union(alts: &[usize]) -> State { |
| 1943 | State::Union { |
| 1944 | alternates: alts |
| 1945 | .iter() |
| 1946 | .map(|&id| sid(id)) |
| 1947 | .collect::<Vec<StateID>>() |
| 1948 | .into_boxed_slice(), |
| 1949 | } |
| 1950 | } |
| 1951 | |
    /// Construct a capture state writing `slot` for the given pattern and
    /// capture-group index, then transitioning to `next`.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }
| 1960 | |
    /// Construct a fail state (no outgoing transitions).
    fn s_fail() -> State {
        State::Fail
    }
| 1964 | |
    /// Construct a match state for the given pattern.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }
| 1968 | |
    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        // States 0-1 form the lazy prefix loop: prefer entering the pattern
        // (state 2), otherwise consume any byte and loop back to 0.
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }
| 1987 | |
    // A pattern anchored at the start gets no unanchored prefix loop.
    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }
| 1999 | |
    // An end anchor alone does not suppress the unanchored prefix loop.
    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }
| 2017 | |
    // In reverse mode, a start anchor behaves like an end anchor, so the
    // unanchored prefix is still added.
    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }
| 2040 | |
    // In reverse mode, an end anchor behaves like a start anchor, so no
    // unanchored prefix is added.
    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }
| 2061 | |
    // The empty pattern compiles to a lone match state.
    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }
| 2066 | |
| 2067 | #[test ] |
| 2068 | fn compile_literal() { |
| 2069 | assert_eq!(build("a" ).states(), &[s_byte(b'a' , 1), s_match(0),]); |
| 2070 | assert_eq!( |
| 2071 | build("ab" ).states(), |
| 2072 | &[s_byte(b'a' , 1), s_byte(b'b' , 2), s_match(0),] |
| 2073 | ); |
| 2074 | assert_eq!( |
| 2075 | build("ā" ).states(), |
| 2076 | &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)] |
| 2077 | ); |
| 2078 | |
| 2079 | // Check that non-UTF-8 literals work. |
| 2080 | let nfa = NFA::compiler() |
| 2081 | .configure( |
| 2082 | NFA::config() |
| 2083 | .which_captures(WhichCaptures::None) |
| 2084 | .unanchored_prefix(false), |
| 2085 | ) |
| 2086 | .syntax(crate::util::syntax::Config::new().utf8(false)) |
| 2087 | .build(r"(?-u)\xFF" ) |
| 2088 | .unwrap(); |
| 2089 | assert_eq!(nfa.states(), &[s_byte(b' \xFF' , 1), s_match(0),]); |
| 2090 | } |
| 2091 | |
    // ASCII classes compile to a single range state, or a sparse state when
    // the class has multiple disjoint ranges.
    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }
| 2103 | |
| 2104 | #[test ] |
| 2105 | #[cfg (not(miri))] |
| 2106 | fn compile_class_unicode() { |
| 2107 | assert_eq!( |
| 2108 | build(r"[\u03B1-\u03B4]" ).states(), |
| 2109 | &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)] |
| 2110 | ); |
| 2111 | assert_eq!( |
| 2112 | build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]" ).states(), |
| 2113 | &[ |
| 2114 | s_range(0xB1, 0xB4, 5), |
| 2115 | s_range(0x99, 0x9E, 5), |
| 2116 | s_byte(0xA4, 1), |
| 2117 | s_byte(0x9F, 2), |
| 2118 | s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), |
| 2119 | s_match(0), |
| 2120 | ] |
| 2121 | ); |
| 2122 | assert_eq!( |
| 2123 | build(r"[a-zā]" ).states(), |
| 2124 | &[ |
| 2125 | s_byte(0x83, 3), |
| 2126 | s_byte(0x98, 0), |
| 2127 | s_sparse(&[(b'a' , b'z' , 3), (0xE2, 0xE2, 1)]), |
| 2128 | s_match(0), |
| 2129 | ] |
| 2130 | ); |
| 2131 | } |
| 2132 | |
    // Greedy `a?` prefers entering the `a` state first; lazy `a??` prefers
    // skipping straight to the match state.
    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }
| 2144 | |
    // With captures disabled, groups add no states of their own; only the
    // repetition structure shows up.
    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }
| 2160 | |
    // Alternations of single bytes fuse into a range or a sparse state;
    // alternations with an empty branch use a binary union.
    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }
| 2185 | |
    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        // Each of the three codepoints gets its own reversed 3-byte chain
        // (states 1-3, 4-6, 7-9), all joined by the union at state 0.
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }
| 2219 | |
    // Building multiple patterns produces one match state per pattern and a
    // shared start state that unions the per-pattern start states.
    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }
| 2246 | |
    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        // An empty class can match nothing, so it compiles to a fail state.
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }
| 2262 | |
    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        // An empty class can match nothing, so it compiles to a fail state.
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }
| 2276 | |
    // With all captures enabled, both the implicit group 0 and the explicit
    // group get start/end capture states around their sub-expressions.
    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }
| 2303 | |
    // With only implicit captures, group 0 gets capture states but the
    // explicit `(b)` group does not.
    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }
| 2328 | |
    // With captures disabled entirely, no capture states appear at all.
    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
| 2346 | } |
| 2347 | |