| 1 | /*! |
| 2 | Types and routines that support the search APIs of most regex engines. |
| 3 | |
| 4 | This sub-module isn't exposed directly, but rather, its contents are exported |
| 5 | at the crate root due to the universality of most of the types and routines in |
| 6 | this module. |
| 7 | */ |
| 8 | |
| 9 | use core::ops::{Range, RangeBounds}; |
| 10 | |
| 11 | use crate::util::{escape::DebugByte, primitives::PatternID, utf8}; |
| 12 | |
| 13 | /// The parameters for a regex search including the haystack to search. |
| 14 | /// |
| 15 | /// It turns out that regex searches have a few parameters, and in most cases, |
| 16 | /// those parameters have defaults that work in the vast majority of cases. |
| 17 | /// This `Input` type exists to make that common case seamnless while also |
| 18 | /// providing an avenue for changing the parameters of a search. In particular, |
| 19 | /// this type enables doing so without a combinatorial explosion of different |
| 20 | /// methods and/or superfluous parameters in the common cases. |
| 21 | /// |
| 22 | /// An `Input` permits configuring the following things: |
| 23 | /// |
| 24 | /// * Search only a substring of a haystack, while taking the broader context |
| 25 | /// into account for resolving look-around assertions. |
| 26 | /// * Indicating whether to search for all patterns in a regex, or to |
| 27 | /// only search for one pattern in particular. |
| 28 | /// * Whether to perform an anchored on unanchored search. |
| 29 | /// * Whether to report a match as early as possible. |
| 30 | /// |
| 31 | /// All of these parameters, except for the haystack, have sensible default |
| 32 | /// values. This means that the minimal search configuration is simply a call |
| 33 | /// to [`Input::new`] with your haystack. Setting any other parameter is |
| 34 | /// optional. |
| 35 | /// |
| 36 | /// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a |
| 37 | /// `From<H> for Input` implementation. This is useful because many of the |
| 38 | /// search APIs in this crate accept an `Into<Input>`. This means you can |
| 39 | /// provide string or byte strings to these routines directly, and they'll |
| 40 | /// automatically get converted into an `Input` for you. |
| 41 | /// |
| 42 | /// The lifetime parameter `'h` refers to the lifetime of the haystack. |
| 43 | /// |
| 44 | /// # Organization |
| 45 | /// |
| 46 | /// The API of `Input` is split into a few different parts: |
| 47 | /// |
| 48 | /// * A builder-like API that transforms a `Input` by value. Examples: |
| 49 | /// [`Input::span`] and [`Input::anchored`]. |
| 50 | /// * A setter API that permits mutating parameters in place. Examples: |
| 51 | /// [`Input::set_span`] and [`Input::set_anchored`]. |
| 52 | /// * A getter API that permits retrieving any of the search parameters. |
| 53 | /// Examples: [`Input::get_span`] and [`Input::get_anchored`]. |
| 54 | /// * A few convenience getter routines that don't conform to the above naming |
| 55 | /// pattern due to how common they are. Examples: [`Input::haystack`], |
| 56 | /// [`Input::start`] and [`Input::end`]. |
| 57 | /// * Miscellaneous predicates and other helper routines that are useful |
| 58 | /// in some contexts. Examples: [`Input::is_char_boundary`]. |
| 59 | /// |
| 60 | /// A `Input` exposes so much because it is meant to be used by both callers of |
| 61 | /// regex engines _and_ implementors of regex engines. A constraining factor is |
| 62 | /// that regex engines should accept a `&Input` as its lowest level API, which |
| 63 | /// means that implementors should only use the "getter" APIs of a `Input`. |
| 64 | /// |
| 65 | /// # Valid bounds and search termination |
| 66 | /// |
| 67 | /// An `Input` permits setting the bounds of a search via either |
| 68 | /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or |
| 69 | /// else a panic will occur. Bounds are valid if and only if: |
| 70 | /// |
| 71 | /// * The bounds represent a valid range into the input's haystack. |
| 72 | /// * **or** the end bound is a valid ending bound for the haystack *and* |
| 73 | /// the start bound is exactly one greater than the start bound. |
| 74 | /// |
| 75 | /// In the latter case, [`Input::is_done`] will return true and indicates any |
| 76 | /// search receiving such an input should immediately return with no match. |
| 77 | /// |
| 78 | /// Note that while `Input` is used for reverse searches in this crate, the |
| 79 | /// `Input::is_done` predicate assumes a forward search. Because unsigned |
| 80 | /// offsets are used internally, there is no way to tell from only the offsets |
| 81 | /// whether a reverse search is done or not. |
| 82 | /// |
| 83 | /// # Regex engine support |
| 84 | /// |
| 85 | /// Any regex engine accepting an `Input` must support at least the following |
| 86 | /// things: |
| 87 | /// |
| 88 | /// * Searching a `&[u8]` for matches. |
| 89 | /// * Searching a substring of `&[u8]` for a match, such that any match |
| 90 | /// reported must appear entirely within that substring. |
| 91 | /// * For a forwards search, a match should never be reported when |
| 92 | /// [`Input::is_done`] returns true. (For reverse searches, termination should |
| 93 | /// be handled outside of `Input`.) |
| 94 | /// |
| 95 | /// Supporting other aspects of an `Input` are optional, but regex engines |
| 96 | /// should handle aspects they don't support gracefully. How this is done is |
| 97 | /// generally up to the regex engine. This crate generally treats unsupported |
| 98 | /// anchored modes as an error to report for example, but for simplicity, in |
| 99 | /// the meta regex engine, trying to search with an invalid pattern ID just |
| 100 | /// results in no match being reported. |
| 101 | #[derive (Clone)] |
| 102 | pub struct Input<'h> { |
| 103 | haystack: &'h [u8], |
| 104 | span: Span, |
| 105 | anchored: Anchored, |
| 106 | earliest: bool, |
| 107 | } |
| 108 | |
| 109 | impl<'h> Input<'h> { |
| 110 | /// Create a new search configuration for the given haystack. |
| 111 | #[inline ] |
| 112 | pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { |
| 113 | // Perform only one call to `haystack.as_ref()` to protect from incorrect |
| 114 | // implementations that return different values from multiple calls. |
| 115 | // This is important because there's code that relies on `span` not being |
| 116 | // out of bounds with respect to the stored `haystack`. |
| 117 | let haystack = haystack.as_ref(); |
| 118 | Input { |
| 119 | haystack, |
| 120 | span: Span { start: 0, end: haystack.len() }, |
| 121 | anchored: Anchored::No, |
| 122 | earliest: false, |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | /// Set the span for this search. |
| 127 | /// |
| 128 | /// This routine does not panic if the span given is not a valid range for |
| 129 | /// this search's haystack. If this search is run with an invalid range, |
| 130 | /// then the most likely outcome is that the actual search execution will |
| 131 | /// panic. |
| 132 | /// |
| 133 | /// This routine is generic over how a span is provided. While |
| 134 | /// a [`Span`] may be given directly, one may also provide a |
| 135 | /// `std::ops::Range<usize>`. To provide anything supported by range |
| 136 | /// syntax, use the [`Input::range`] method. |
| 137 | /// |
| 138 | /// The default span is the entire haystack. |
| 139 | /// |
| 140 | /// Note that [`Input::range`] overrides this method and vice versa. |
| 141 | /// |
| 142 | /// # Panics |
| 143 | /// |
| 144 | /// This panics if the given span does not correspond to valid bounds in |
| 145 | /// the haystack or the termination of a search. |
| 146 | /// |
| 147 | /// # Example |
| 148 | /// |
| 149 | /// This example shows how the span of the search can impact whether a |
| 150 | /// match is reported or not. This is particularly relevant for look-around |
| 151 | /// operators, which might take things outside of the span into account |
| 152 | /// when determining whether they match. |
| 153 | /// |
| 154 | /// ``` |
| 155 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 156 | /// use regex_automata::{ |
| 157 | /// nfa::thompson::pikevm::PikeVM, |
| 158 | /// Match, Input, |
| 159 | /// }; |
| 160 | /// |
| 161 | /// // Look for 'at', but as a distinct word. |
| 162 | /// let re = PikeVM::new(r"\bat\b" )?; |
| 163 | /// let mut cache = re.create_cache(); |
| 164 | /// let mut caps = re.create_captures(); |
| 165 | /// |
| 166 | /// // Our haystack contains 'at', but not as a distinct word. |
| 167 | /// let haystack = "batter" ; |
| 168 | /// |
| 169 | /// // A standard search finds nothing, as expected. |
| 170 | /// let input = Input::new(haystack); |
| 171 | /// re.search(&mut cache, &input, &mut caps); |
| 172 | /// assert_eq!(None, caps.get_match()); |
| 173 | /// |
| 174 | /// // But if we wanted to search starting at position '1', we might |
| 175 | /// // slice the haystack. If we do this, it's impossible for the \b |
| 176 | /// // anchors to take the surrounding context into account! And thus, |
| 177 | /// // a match is produced. |
| 178 | /// let input = Input::new(&haystack[1..3]); |
| 179 | /// re.search(&mut cache, &input, &mut caps); |
| 180 | /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); |
| 181 | /// |
| 182 | /// // But if we specify the span of the search instead of slicing the |
| 183 | /// // haystack, then the regex engine can "see" outside of the span |
| 184 | /// // and resolve the anchors correctly. |
| 185 | /// let input = Input::new(haystack).span(1..3); |
| 186 | /// re.search(&mut cache, &input, &mut caps); |
| 187 | /// assert_eq!(None, caps.get_match()); |
| 188 | /// |
| 189 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 190 | /// ``` |
| 191 | /// |
| 192 | /// This may seem a little ham-fisted, but this scenario tends to come up |
| 193 | /// if some other regex engine found the match span and now you need to |
| 194 | /// re-process that span to look for capturing groups. (e.g., Run a faster |
| 195 | /// DFA first, find a match, then run the PikeVM on just the match span to |
| 196 | /// resolve capturing groups.) In order to implement that sort of logic |
| 197 | /// correctly, you need to set the span on the search instead of slicing |
| 198 | /// the haystack directly. |
| 199 | /// |
| 200 | /// The other advantage of using this routine to specify the bounds of the |
| 201 | /// search is that the match offsets are still reported in terms of the |
| 202 | /// original haystack. For example, the second search in the example above |
| 203 | /// reported a match at position `0`, even though `at` starts at offset |
| 204 | /// `1` because we sliced the haystack. |
| 205 | #[inline ] |
| 206 | pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { |
| 207 | self.set_span(span); |
| 208 | self |
| 209 | } |
| 210 | |
| 211 | /// Like `Input::span`, but accepts any range instead. |
| 212 | /// |
| 213 | /// This routine does not panic if the range given is not a valid range for |
| 214 | /// this search's haystack. If this search is run with an invalid range, |
| 215 | /// then the most likely outcome is that the actual search execution will |
| 216 | /// panic. |
| 217 | /// |
| 218 | /// The default range is the entire haystack. |
| 219 | /// |
| 220 | /// Note that [`Input::span`] overrides this method and vice versa. |
| 221 | /// |
| 222 | /// # Panics |
| 223 | /// |
| 224 | /// This routine will panic if the given range could not be converted |
| 225 | /// to a valid [`Range`]. For example, this would panic when given |
| 226 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
| 227 | /// interval in terms of `usize`. |
| 228 | /// |
| 229 | /// This also panics if the given range does not correspond to valid bounds |
| 230 | /// in the haystack or the termination of a search. |
| 231 | /// |
| 232 | /// # Example |
| 233 | /// |
| 234 | /// ``` |
| 235 | /// use regex_automata::Input; |
| 236 | /// |
| 237 | /// let input = Input::new("foobar" ); |
| 238 | /// assert_eq!(0..6, input.get_range()); |
| 239 | /// |
| 240 | /// let input = Input::new("foobar" ).range(2..=4); |
| 241 | /// assert_eq!(2..5, input.get_range()); |
| 242 | /// ``` |
| 243 | #[inline ] |
| 244 | pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { |
| 245 | self.set_range(range); |
| 246 | self |
| 247 | } |
| 248 | |
| 249 | /// Sets the anchor mode of a search. |
| 250 | /// |
| 251 | /// When a search is anchored (so that's [`Anchored::Yes`] or |
| 252 | /// [`Anchored::Pattern`]), a match must begin at the start of a search. |
| 253 | /// When a search is not anchored (that's [`Anchored::No`]), regex engines |
| 254 | /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix |
| 255 | /// permits a match to appear anywhere. |
| 256 | /// |
| 257 | /// By default, the anchored mode is [`Anchored::No`]. |
| 258 | /// |
| 259 | /// **WARNING:** this is subtly different than using a `^` at the start of |
| 260 | /// your regex. A `^` forces a regex to match exclusively at the start of |
| 261 | /// a haystack, regardless of where you begin your search. In contrast, |
| 262 | /// anchoring a search will allow your regex to match anywhere in your |
| 263 | /// haystack, but the match must start at the beginning of a search. |
| 264 | /// |
| 265 | /// For example, consider the haystack `aba` and the following searches: |
| 266 | /// |
| 267 | /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` |
| 268 | /// starting at position `2`. Since `^` requires the match to start at |
| 269 | /// the beginning of the haystack and `2 > 0`, no match is found. |
| 270 | /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
| 271 | /// starting at position `2`. This reports a match at `[2, 3]` since |
| 272 | /// the match starts where the search started. Since there is no `^`, |
| 273 | /// there is no requirement for the match to start at the beginning of |
| 274 | /// the haystack. |
| 275 | /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` |
| 276 | /// starting at position `1`. Since `b` corresponds to position `1` and |
| 277 | /// since the search is anchored, it finds no match. While the regex |
| 278 | /// matches at other positions, configuring the search to be anchored |
| 279 | /// requires that it only report a match that begins at the same offset |
| 280 | /// as the beginning of the search. |
| 281 | /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` |
| 282 | /// starting at position `1`. Since the search is not anchored and |
| 283 | /// the regex does not start with `^`, the search executes as if there |
| 284 | /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it |
| 285 | /// reports a match at `[2, 3]`. |
| 286 | /// |
| 287 | /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, |
| 288 | /// except it only reports matches for a particular pattern. |
| 289 | /// |
| 290 | /// # Example |
| 291 | /// |
| 292 | /// This demonstrates the differences between an anchored search and |
| 293 | /// a pattern that begins with `^` (as described in the above warning |
| 294 | /// message). |
| 295 | /// |
| 296 | /// ``` |
| 297 | /// use regex_automata::{ |
| 298 | /// nfa::thompson::pikevm::PikeVM, |
| 299 | /// Anchored, Match, Input, |
| 300 | /// }; |
| 301 | /// |
| 302 | /// let haystack = "aba" ; |
| 303 | /// |
| 304 | /// let re = PikeVM::new(r"^a" )?; |
| 305 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| 306 | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); |
| 307 | /// re.search(&mut cache, &input, &mut caps); |
| 308 | /// // No match is found because 2 is not the beginning of the haystack, |
| 309 | /// // which is what ^ requires. |
| 310 | /// assert_eq!(None, caps.get_match()); |
| 311 | /// |
| 312 | /// let re = PikeVM::new(r"a" )?; |
| 313 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| 314 | /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); |
| 315 | /// re.search(&mut cache, &input, &mut caps); |
| 316 | /// // An anchored search can still match anywhere in the haystack, it just |
| 317 | /// // must begin at the start of the search which is '2' in this case. |
| 318 | /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); |
| 319 | /// |
| 320 | /// let re = PikeVM::new(r"a" )?; |
| 321 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| 322 | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); |
| 323 | /// re.search(&mut cache, &input, &mut caps); |
| 324 | /// // No match is found since we start searching at offset 1 which |
| 325 | /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match |
| 326 | /// // is found. |
| 327 | /// assert_eq!(None, caps.get_match()); |
| 328 | /// |
| 329 | /// let re = PikeVM::new(r"a" )?; |
| 330 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| 331 | /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); |
| 332 | /// re.search(&mut cache, &input, &mut caps); |
| 333 | /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the |
| 334 | /// // pattern. Even though the search starts at 'b', the 'match anything' |
| 335 | /// // prefix allows the search to match 'a'. |
| 336 | /// let expected = Some(Match::must(0, 2..3)); |
| 337 | /// assert_eq!(expected, caps.get_match()); |
| 338 | /// |
| 339 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 340 | /// ``` |
| 341 | #[inline ] |
| 342 | pub fn anchored(mut self, mode: Anchored) -> Input<'h> { |
| 343 | self.set_anchored(mode); |
| 344 | self |
| 345 | } |
| 346 | |
| 347 | /// Whether to execute an "earliest" search or not. |
| 348 | /// |
| 349 | /// When running a non-overlapping search, an "earliest" search will return |
| 350 | /// the match location as early as possible. For example, given a pattern |
| 351 | /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search |
| 352 | /// will return `foo12345` as a match. But an "earliest" search for regex |
| 353 | /// engines that support "earliest" semantics will return `foo1` as a |
| 354 | /// match, since as soon as the first digit following `foo` is seen, it is |
| 355 | /// known to have found a match. |
| 356 | /// |
| 357 | /// Note that "earliest" semantics generally depend on the regex engine. |
| 358 | /// Different regex engines may determine there is a match at different |
| 359 | /// points. So there is no guarantee that "earliest" matches will always |
| 360 | /// return the same offsets for all regex engines. The "earliest" notion |
| 361 | /// is really about when the particular regex engine determines there is |
| 362 | /// a match rather than a consistent semantic unto itself. This is often |
| 363 | /// useful for implementing "did a match occur or not" predicates, but |
| 364 | /// sometimes the offset is useful as well. |
| 365 | /// |
| 366 | /// This is disabled by default. |
| 367 | /// |
| 368 | /// # Example |
| 369 | /// |
| 370 | /// This example shows the difference between "earliest" searching and |
| 371 | /// normal searching. |
| 372 | /// |
| 373 | /// ``` |
| 374 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
| 375 | /// |
| 376 | /// let re = PikeVM::new(r"foo[0-9]+" )?; |
| 377 | /// let mut cache = re.create_cache(); |
| 378 | /// let mut caps = re.create_captures(); |
| 379 | /// |
| 380 | /// // A normal search implements greediness like you expect. |
| 381 | /// let input = Input::new("foo12345" ); |
| 382 | /// re.search(&mut cache, &input, &mut caps); |
| 383 | /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); |
| 384 | /// |
| 385 | /// // When 'earliest' is enabled and the regex engine supports |
| 386 | /// // it, the search will bail once it knows a match has been |
| 387 | /// // found. |
| 388 | /// let input = Input::new("foo12345" ).earliest(true); |
| 389 | /// re.search(&mut cache, &input, &mut caps); |
| 390 | /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); |
| 391 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 392 | /// ``` |
| 393 | #[inline ] |
| 394 | pub fn earliest(mut self, yes: bool) -> Input<'h> { |
| 395 | self.set_earliest(yes); |
| 396 | self |
| 397 | } |
| 398 | |
| 399 | /// Set the span for this search configuration. |
| 400 | /// |
| 401 | /// This is like the [`Input::span`] method, except this mutates the |
| 402 | /// span in place. |
| 403 | /// |
| 404 | /// This routine is generic over how a span is provided. While |
| 405 | /// a [`Span`] may be given directly, one may also provide a |
| 406 | /// `std::ops::Range<usize>`. |
| 407 | /// |
| 408 | /// # Panics |
| 409 | /// |
| 410 | /// This panics if the given span does not correspond to valid bounds in |
| 411 | /// the haystack or the termination of a search. |
| 412 | /// |
| 413 | /// # Example |
| 414 | /// |
| 415 | /// ``` |
| 416 | /// use regex_automata::Input; |
| 417 | /// |
| 418 | /// let mut input = Input::new("foobar" ); |
| 419 | /// assert_eq!(0..6, input.get_range()); |
| 420 | /// input.set_span(2..4); |
| 421 | /// assert_eq!(2..4, input.get_range()); |
| 422 | /// ``` |
| 423 | #[inline ] |
| 424 | pub fn set_span<S: Into<Span>>(&mut self, span: S) { |
| 425 | let span = span.into(); |
| 426 | assert!( |
| 427 | span.end <= self.haystack.len() |
| 428 | && span.start <= span.end.wrapping_add(1), |
| 429 | "invalid span {:?} for haystack of length {}" , |
| 430 | span, |
| 431 | self.haystack.len(), |
| 432 | ); |
| 433 | self.span = span; |
| 434 | } |
| 435 | |
| 436 | /// Set the span for this search configuration given any range. |
| 437 | /// |
| 438 | /// This is like the [`Input::range`] method, except this mutates the |
| 439 | /// span in place. |
| 440 | /// |
| 441 | /// This routine does not panic if the range given is not a valid range for |
| 442 | /// this search's haystack. If this search is run with an invalid range, |
| 443 | /// then the most likely outcome is that the actual search execution will |
| 444 | /// panic. |
| 445 | /// |
| 446 | /// # Panics |
| 447 | /// |
| 448 | /// This routine will panic if the given range could not be converted |
| 449 | /// to a valid [`Range`]. For example, this would panic when given |
| 450 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
| 451 | /// interval in terms of `usize`. |
| 452 | /// |
| 453 | /// This also panics if the given span does not correspond to valid bounds |
| 454 | /// in the haystack or the termination of a search. |
| 455 | /// |
| 456 | /// # Example |
| 457 | /// |
| 458 | /// ``` |
| 459 | /// use regex_automata::Input; |
| 460 | /// |
| 461 | /// let mut input = Input::new("foobar" ); |
| 462 | /// assert_eq!(0..6, input.get_range()); |
| 463 | /// input.set_range(2..=4); |
| 464 | /// assert_eq!(2..5, input.get_range()); |
| 465 | /// ``` |
| 466 | #[inline ] |
| 467 | pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { |
| 468 | use core::ops::Bound; |
| 469 | |
| 470 | // It's a little weird to convert ranges into spans, and then spans |
| 471 | // back into ranges when we actually slice the haystack. Because |
| 472 | // of that process, we always represent everything as a half-open |
| 473 | // internal. Therefore, handling things like m..=n is a little awkward. |
| 474 | let start = match range.start_bound() { |
| 475 | Bound::Included(&i) => i, |
| 476 | // Can this case ever happen? Range syntax doesn't support it... |
| 477 | Bound::Excluded(&i) => i.checked_add(1).unwrap(), |
| 478 | Bound::Unbounded => 0, |
| 479 | }; |
| 480 | let end = match range.end_bound() { |
| 481 | Bound::Included(&i) => i.checked_add(1).unwrap(), |
| 482 | Bound::Excluded(&i) => i, |
| 483 | Bound::Unbounded => self.haystack().len(), |
| 484 | }; |
| 485 | self.set_span(Span { start, end }); |
| 486 | } |
| 487 | |
| 488 | /// Set the starting offset for the span for this search configuration. |
| 489 | /// |
| 490 | /// This is a convenience routine for only mutating the start of a span |
| 491 | /// without having to set the entire span. |
| 492 | /// |
| 493 | /// # Panics |
| 494 | /// |
| 495 | /// This panics if the span resulting from the new start position does not |
| 496 | /// correspond to valid bounds in the haystack or the termination of a |
| 497 | /// search. |
| 498 | /// |
| 499 | /// # Example |
| 500 | /// |
| 501 | /// ``` |
| 502 | /// use regex_automata::Input; |
| 503 | /// |
| 504 | /// let mut input = Input::new("foobar" ); |
| 505 | /// assert_eq!(0..6, input.get_range()); |
| 506 | /// input.set_start(5); |
| 507 | /// assert_eq!(5..6, input.get_range()); |
| 508 | /// ``` |
| 509 | #[inline ] |
| 510 | pub fn set_start(&mut self, start: usize) { |
| 511 | self.set_span(Span { start, ..self.get_span() }); |
| 512 | } |
| 513 | |
| 514 | /// Set the ending offset for the span for this search configuration. |
| 515 | /// |
| 516 | /// This is a convenience routine for only mutating the end of a span |
| 517 | /// without having to set the entire span. |
| 518 | /// |
| 519 | /// # Panics |
| 520 | /// |
| 521 | /// This panics if the span resulting from the new end position does not |
| 522 | /// correspond to valid bounds in the haystack or the termination of a |
| 523 | /// search. |
| 524 | /// |
| 525 | /// # Example |
| 526 | /// |
| 527 | /// ``` |
| 528 | /// use regex_automata::Input; |
| 529 | /// |
| 530 | /// let mut input = Input::new("foobar" ); |
| 531 | /// assert_eq!(0..6, input.get_range()); |
| 532 | /// input.set_end(5); |
| 533 | /// assert_eq!(0..5, input.get_range()); |
| 534 | /// ``` |
| 535 | #[inline ] |
| 536 | pub fn set_end(&mut self, end: usize) { |
| 537 | self.set_span(Span { end, ..self.get_span() }); |
| 538 | } |
| 539 | |
| 540 | /// Set the anchor mode of a search. |
| 541 | /// |
| 542 | /// This is like [`Input::anchored`], except it mutates the search |
| 543 | /// configuration in place. |
| 544 | /// |
| 545 | /// # Example |
| 546 | /// |
| 547 | /// ``` |
| 548 | /// use regex_automata::{Anchored, Input, PatternID}; |
| 549 | /// |
| 550 | /// let mut input = Input::new("foobar" ); |
| 551 | /// assert_eq!(Anchored::No, input.get_anchored()); |
| 552 | /// |
| 553 | /// let pid = PatternID::must(5); |
| 554 | /// input.set_anchored(Anchored::Pattern(pid)); |
| 555 | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
| 556 | /// ``` |
| 557 | #[inline ] |
| 558 | pub fn set_anchored(&mut self, mode: Anchored) { |
| 559 | self.anchored = mode; |
| 560 | } |
| 561 | |
| 562 | /// Set whether the search should execute in "earliest" mode or not. |
| 563 | /// |
| 564 | /// This is like [`Input::earliest`], except it mutates the search |
| 565 | /// configuration in place. |
| 566 | /// |
| 567 | /// # Example |
| 568 | /// |
| 569 | /// ``` |
| 570 | /// use regex_automata::Input; |
| 571 | /// |
| 572 | /// let mut input = Input::new("foobar" ); |
| 573 | /// assert!(!input.get_earliest()); |
| 574 | /// input.set_earliest(true); |
| 575 | /// assert!(input.get_earliest()); |
| 576 | /// ``` |
| 577 | #[inline ] |
| 578 | pub fn set_earliest(&mut self, yes: bool) { |
| 579 | self.earliest = yes; |
| 580 | } |
| 581 | |
| 582 | /// Return a borrow of the underlying haystack as a slice of bytes. |
| 583 | /// |
| 584 | /// # Example |
| 585 | /// |
| 586 | /// ``` |
| 587 | /// use regex_automata::Input; |
| 588 | /// |
| 589 | /// let input = Input::new("foobar" ); |
| 590 | /// assert_eq!(b"foobar" , input.haystack()); |
| 591 | /// ``` |
| 592 | #[inline ] |
| 593 | pub fn haystack(&self) -> &[u8] { |
| 594 | self.haystack |
| 595 | } |
| 596 | |
| 597 | /// Return the start position of this search. |
| 598 | /// |
| 599 | /// This is a convenience routine for `search.get_span().start()`. |
| 600 | /// |
| 601 | /// When [`Input::is_done`] is `false`, this is guaranteed to return |
| 602 | /// an offset that is less than or equal to [`Input::end`]. Otherwise, |
| 603 | /// the offset is one greater than [`Input::end`]. |
| 604 | /// |
| 605 | /// # Example |
| 606 | /// |
| 607 | /// ``` |
| 608 | /// use regex_automata::Input; |
| 609 | /// |
| 610 | /// let input = Input::new("foobar" ); |
| 611 | /// assert_eq!(0, input.start()); |
| 612 | /// |
| 613 | /// let input = Input::new("foobar" ).span(2..4); |
| 614 | /// assert_eq!(2, input.start()); |
| 615 | /// ``` |
| 616 | #[inline ] |
| 617 | pub fn start(&self) -> usize { |
| 618 | self.get_span().start |
| 619 | } |
| 620 | |
| 621 | /// Return the end position of this search. |
| 622 | /// |
| 623 | /// This is a convenience routine for `search.get_span().end()`. |
| 624 | /// |
| 625 | /// This is guaranteed to return an offset that is a valid exclusive end |
| 626 | /// bound for this input's haystack. |
| 627 | /// |
| 628 | /// # Example |
| 629 | /// |
| 630 | /// ``` |
| 631 | /// use regex_automata::Input; |
| 632 | /// |
| 633 | /// let input = Input::new("foobar" ); |
| 634 | /// assert_eq!(6, input.end()); |
| 635 | /// |
| 636 | /// let input = Input::new("foobar" ).span(2..4); |
| 637 | /// assert_eq!(4, input.end()); |
| 638 | /// ``` |
| 639 | #[inline ] |
| 640 | pub fn end(&self) -> usize { |
| 641 | self.get_span().end |
| 642 | } |
| 643 | |
| 644 | /// Return the span for this search configuration. |
| 645 | /// |
| 646 | /// If one was not explicitly set, then the span corresponds to the entire |
| 647 | /// range of the haystack. |
| 648 | /// |
| 649 | /// When [`Input::is_done`] is `false`, the span returned is guaranteed |
| 650 | /// to correspond to valid bounds for this input's haystack. |
| 651 | /// |
| 652 | /// # Example |
| 653 | /// |
| 654 | /// ``` |
| 655 | /// use regex_automata::{Input, Span}; |
| 656 | /// |
| 657 | /// let input = Input::new("foobar" ); |
| 658 | /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); |
| 659 | /// ``` |
| 660 | #[inline ] |
| 661 | pub fn get_span(&self) -> Span { |
| 662 | self.span |
| 663 | } |
| 664 | |
| 665 | /// Return the span as a range for this search configuration. |
| 666 | /// |
| 667 | /// If one was not explicitly set, then the span corresponds to the entire |
| 668 | /// range of the haystack. |
| 669 | /// |
| 670 | /// When [`Input::is_done`] is `false`, the range returned is guaranteed |
| 671 | /// to correspond to valid bounds for this input's haystack. |
| 672 | /// |
| 673 | /// # Example |
| 674 | /// |
| 675 | /// ``` |
| 676 | /// use regex_automata::Input; |
| 677 | /// |
| 678 | /// let input = Input::new("foobar" ); |
| 679 | /// assert_eq!(0..6, input.get_range()); |
| 680 | /// ``` |
| 681 | #[inline ] |
| 682 | pub fn get_range(&self) -> Range<usize> { |
| 683 | self.get_span().range() |
| 684 | } |
| 685 | |
| 686 | /// Return the anchored mode for this search configuration. |
| 687 | /// |
| 688 | /// If no anchored mode was set, then it defaults to [`Anchored::No`]. |
| 689 | /// |
| 690 | /// # Example |
| 691 | /// |
| 692 | /// ``` |
| 693 | /// use regex_automata::{Anchored, Input, PatternID}; |
| 694 | /// |
| 695 | /// let mut input = Input::new("foobar" ); |
| 696 | /// assert_eq!(Anchored::No, input.get_anchored()); |
| 697 | /// |
| 698 | /// let pid = PatternID::must(5); |
| 699 | /// input.set_anchored(Anchored::Pattern(pid)); |
| 700 | /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); |
| 701 | /// ``` |
| 702 | #[inline ] |
| 703 | pub fn get_anchored(&self) -> Anchored { |
| 704 | self.anchored |
| 705 | } |
| 706 | |
| 707 | /// Return whether this search should execute in "earliest" mode. |
| 708 | /// |
| 709 | /// # Example |
| 710 | /// |
| 711 | /// ``` |
| 712 | /// use regex_automata::Input; |
| 713 | /// |
| 714 | /// let input = Input::new("foobar" ); |
| 715 | /// assert!(!input.get_earliest()); |
| 716 | /// ``` |
| 717 | #[inline ] |
| 718 | pub fn get_earliest(&self) -> bool { |
| 719 | self.earliest |
| 720 | } |
| 721 | |
| 722 | /// Return true if and only if this search can never return any other |
| 723 | /// matches. |
| 724 | /// |
| 725 | /// This occurs when the start position of this search is greater than the |
| 726 | /// end position of the search. |
| 727 | /// |
| 728 | /// # Example |
| 729 | /// |
| 730 | /// ``` |
| 731 | /// use regex_automata::Input; |
| 732 | /// |
| 733 | /// let mut input = Input::new("foobar" ); |
| 734 | /// assert!(!input.is_done()); |
| 735 | /// input.set_start(6); |
| 736 | /// assert!(!input.is_done()); |
| 737 | /// input.set_start(7); |
| 738 | /// assert!(input.is_done()); |
| 739 | /// ``` |
| 740 | #[inline ] |
| 741 | pub fn is_done(&self) -> bool { |
| 742 | self.get_span().start > self.get_span().end |
| 743 | } |
| 744 | |
| 745 | /// Returns true if and only if the given offset in this search's haystack |
| 746 | /// falls on a valid UTF-8 encoded codepoint boundary. |
| 747 | /// |
| 748 | /// If the haystack is not valid UTF-8, then the behavior of this routine |
| 749 | /// is unspecified. |
| 750 | /// |
| 751 | /// # Example |
| 752 | /// |
| 753 | /// This shows where codepoint boundaries do and don't exist in valid |
| 754 | /// UTF-8. |
| 755 | /// |
| 756 | /// ``` |
| 757 | /// use regex_automata::Input; |
| 758 | /// |
| 759 | /// let input = Input::new("☃" ); |
| 760 | /// assert!(input.is_char_boundary(0)); |
| 761 | /// assert!(!input.is_char_boundary(1)); |
| 762 | /// assert!(!input.is_char_boundary(2)); |
| 763 | /// assert!(input.is_char_boundary(3)); |
| 764 | /// assert!(!input.is_char_boundary(4)); |
| 765 | /// ``` |
| 766 | #[inline ] |
| 767 | pub fn is_char_boundary(&self, offset: usize) -> bool { |
| 768 | utf8::is_boundary(self.haystack(), offset) |
| 769 | } |
| 770 | } |
| 771 | |
| 772 | impl<'h> core::fmt::Debug for Input<'h> { |
| 773 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 774 | use crate::util::escape::DebugHaystack; |
| 775 | |
| 776 | f&mut DebugStruct<'_, '_>.debug_struct("Input" ) |
| 777 | .field("haystack" , &DebugHaystack(self.haystack())) |
| 778 | .field("span" , &self.span) |
| 779 | .field("anchored" , &self.anchored) |
| 780 | .field(name:"earliest" , &self.earliest) |
| 781 | .finish() |
| 782 | } |
| 783 | } |
| 784 | |
| 785 | impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { |
| 786 | fn from(haystack: &'h H) -> Input<'h> { |
| 787 | Input::new(haystack) |
| 788 | } |
| 789 | } |
| 790 | |
| 791 | /// A representation of a span reported by a regex engine. |
| 792 | /// |
| 793 | /// A span corresponds to the starting and ending _byte offsets_ of a |
| 794 | /// contiguous region of bytes. The starting offset is inclusive while the |
| 795 | /// ending offset is exclusive. That is, a span is a half-open interval. |
| 796 | /// |
| 797 | /// A span is used to report the offsets of a match, but it is also used to |
| 798 | /// convey which region of a haystack should be searched via routines like |
| 799 | /// [`Input::span`]. |
| 800 | /// |
| 801 | /// This is basically equivalent to a `std::ops::Range<usize>`, except this |
| 802 | /// type implements `Copy` which makes it more ergonomic to use in the context |
| 803 | /// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, |
| 804 | /// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, |
| 805 | /// which means things like `Span::from(5..10)` work. |
| 806 | #[derive (Clone, Copy, Eq, Hash, PartialEq)] |
| 807 | pub struct Span { |
| 808 | /// The start offset of the span, inclusive. |
| 809 | pub start: usize, |
| 810 | /// The end offset of the span, exclusive. |
| 811 | pub end: usize, |
| 812 | } |
| 813 | |
| 814 | impl Span { |
| 815 | /// Returns this span as a range. |
| 816 | #[inline ] |
| 817 | pub fn range(&self) -> Range<usize> { |
| 818 | Range::from(*self) |
| 819 | } |
| 820 | |
| 821 | /// Returns true when this span is empty. That is, when `start >= end`. |
| 822 | #[inline ] |
| 823 | pub fn is_empty(&self) -> bool { |
| 824 | self.start >= self.end |
| 825 | } |
| 826 | |
| 827 | /// Returns the length of this span. |
| 828 | /// |
| 829 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
| 830 | #[inline ] |
| 831 | pub fn len(&self) -> usize { |
| 832 | self.end.saturating_sub(self.start) |
| 833 | } |
| 834 | |
| 835 | /// Returns true when the given offset is contained within this span. |
| 836 | /// |
| 837 | /// Note that an empty span contains no offsets and will always return |
| 838 | /// false. |
| 839 | #[inline ] |
| 840 | pub fn contains(&self, offset: usize) -> bool { |
| 841 | !self.is_empty() && self.start <= offset && offset <= self.end |
| 842 | } |
| 843 | |
| 844 | /// Returns a new span with `offset` added to this span's `start` and `end` |
| 845 | /// values. |
| 846 | #[inline ] |
| 847 | pub fn offset(&self, offset: usize) -> Span { |
| 848 | Span { start: self.start + offset, end: self.end + offset } |
| 849 | } |
| 850 | } |
| 851 | |
| 852 | impl core::fmt::Debug for Span { |
| 853 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 854 | write!(f, " {}.. {}" , self.start, self.end) |
| 855 | } |
| 856 | } |
| 857 | |
| 858 | impl core::ops::Index<Span> for [u8] { |
| 859 | type Output = [u8]; |
| 860 | |
| 861 | #[inline ] |
| 862 | fn index(&self, index: Span) -> &[u8] { |
| 863 | &self[index.range()] |
| 864 | } |
| 865 | } |
| 866 | |
| 867 | impl core::ops::IndexMut<Span> for [u8] { |
| 868 | #[inline ] |
| 869 | fn index_mut(&mut self, index: Span) -> &mut [u8] { |
| 870 | &mut self[index.range()] |
| 871 | } |
| 872 | } |
| 873 | |
| 874 | impl core::ops::Index<Span> for str { |
| 875 | type Output = str; |
| 876 | |
| 877 | #[inline ] |
| 878 | fn index(&self, index: Span) -> &str { |
| 879 | &self[index.range()] |
| 880 | } |
| 881 | } |
| 882 | |
| 883 | impl From<Range<usize>> for Span { |
| 884 | #[inline ] |
| 885 | fn from(range: Range<usize>) -> Span { |
| 886 | Span { start: range.start, end: range.end } |
| 887 | } |
| 888 | } |
| 889 | |
| 890 | impl From<Span> for Range<usize> { |
| 891 | #[inline ] |
| 892 | fn from(span: Span) -> Range<usize> { |
| 893 | Range { start: span.start, end: span.end } |
| 894 | } |
| 895 | } |
| 896 | |
| 897 | impl PartialEq<Range<usize>> for Span { |
| 898 | #[inline ] |
| 899 | fn eq(&self, range: &Range<usize>) -> bool { |
| 900 | self.start == range.start && self.end == range.end |
| 901 | } |
| 902 | } |
| 903 | |
| 904 | impl PartialEq<Span> for Range<usize> { |
| 905 | #[inline ] |
| 906 | fn eq(&self, span: &Span) -> bool { |
| 907 | self.start == span.start && self.end == span.end |
| 908 | } |
| 909 | } |
| 910 | |
| 911 | /// A representation of "half" of a match reported by a DFA. |
| 912 | /// |
| 913 | /// This is called a "half" match because it only includes the end location (or |
| 914 | /// start location for a reverse search) of a match. This corresponds to the |
| 915 | /// information that a single DFA scan can report. Getting the other half of |
| 916 | /// the match requires a second scan with a reversed DFA. |
| 917 | /// |
| 918 | /// A half match also includes the pattern that matched. The pattern is |
| 919 | /// identified by an ID, which corresponds to its position (starting from `0`) |
| 920 | /// relative to other patterns used to construct the corresponding DFA. If only |
| 921 | /// a single pattern is provided to the DFA, then all matches are guaranteed to |
| 922 | /// have a pattern ID of `0`. |
| 923 | #[derive (Clone, Copy, Debug, Eq, Hash, PartialEq)] |
| 924 | pub struct HalfMatch { |
| 925 | /// The pattern ID. |
| 926 | pattern: PatternID, |
| 927 | /// The offset of the match. |
| 928 | /// |
| 929 | /// For forward searches, the offset is exclusive. For reverse searches, |
| 930 | /// the offset is inclusive. |
| 931 | offset: usize, |
| 932 | } |
| 933 | |
| 934 | impl HalfMatch { |
| 935 | /// Create a new half match from a pattern ID and a byte offset. |
| 936 | #[inline ] |
| 937 | pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { |
| 938 | HalfMatch { pattern, offset } |
| 939 | } |
| 940 | |
| 941 | /// Create a new half match from a pattern ID and a byte offset. |
| 942 | /// |
| 943 | /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a |
| 944 | /// [`PatternID`]. This panics if the given `usize` is not representable |
| 945 | /// as a `PatternID`. |
| 946 | #[inline ] |
| 947 | pub fn must(pattern: usize, offset: usize) -> HalfMatch { |
| 948 | HalfMatch::new(PatternID::new(pattern).unwrap(), offset) |
| 949 | } |
| 950 | |
| 951 | /// Returns the ID of the pattern that matched. |
| 952 | /// |
| 953 | /// The ID of a pattern is derived from the position in which it was |
| 954 | /// originally inserted into the corresponding DFA. The first pattern has |
| 955 | /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. |
| 956 | #[inline ] |
| 957 | pub fn pattern(&self) -> PatternID { |
| 958 | self.pattern |
| 959 | } |
| 960 | |
| 961 | /// The position of the match. |
| 962 | /// |
| 963 | /// If this match was produced by a forward search, then the offset is |
| 964 | /// exclusive. If this match was produced by a reverse search, then the |
| 965 | /// offset is inclusive. |
| 966 | #[inline ] |
| 967 | pub fn offset(&self) -> usize { |
| 968 | self.offset |
| 969 | } |
| 970 | } |
| 971 | |
| 972 | /// A representation of a match reported by a regex engine. |
| 973 | /// |
| 974 | /// A match has two essential pieces of information: the [`PatternID`] that |
| 975 | /// matches, and the [`Span`] of the match in a haystack. |
| 976 | /// |
| 977 | /// The pattern is identified by an ID, which corresponds to its position |
| 978 | /// (starting from `0`) relative to other patterns used to construct the |
| 979 | /// corresponding regex engine. If only a single pattern is provided, then all |
| 980 | /// matches are guaranteed to have a pattern ID of `0`. |
| 981 | /// |
| 982 | /// Every match reported by a regex engine guarantees that its span has its |
| 983 | /// start offset as less than or equal to its end offset. |
| 984 | #[derive (Clone, Copy, Debug, Eq, Hash, PartialEq)] |
| 985 | pub struct Match { |
| 986 | /// The pattern ID. |
| 987 | pattern: PatternID, |
| 988 | /// The underlying match span. |
| 989 | span: Span, |
| 990 | } |
| 991 | |
| 992 | impl Match { |
| 993 | /// Create a new match from a pattern ID and a span. |
| 994 | /// |
| 995 | /// This constructor is generic over how a span is provided. While |
| 996 | /// a [`Span`] may be given directly, one may also provide a |
| 997 | /// `std::ops::Range<usize>`. |
| 998 | /// |
| 999 | /// # Panics |
| 1000 | /// |
| 1001 | /// This panics if `end < start`. |
| 1002 | /// |
| 1003 | /// # Example |
| 1004 | /// |
| 1005 | /// This shows how to create a match for the first pattern in a regex |
| 1006 | /// object using convenient range syntax. |
| 1007 | /// |
| 1008 | /// ``` |
| 1009 | /// use regex_automata::{Match, PatternID}; |
| 1010 | /// |
| 1011 | /// let m = Match::new(PatternID::ZERO, 5..10); |
| 1012 | /// assert_eq!(0, m.pattern().as_usize()); |
| 1013 | /// assert_eq!(5, m.start()); |
| 1014 | /// assert_eq!(10, m.end()); |
| 1015 | /// ``` |
| 1016 | #[inline ] |
| 1017 | pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { |
| 1018 | let span: Span = span.into(); |
| 1019 | assert!(span.start <= span.end, "invalid match span" ); |
| 1020 | Match { pattern, span } |
| 1021 | } |
| 1022 | |
| 1023 | /// Create a new match from a pattern ID and a byte offset span. |
| 1024 | /// |
| 1025 | /// This constructor is generic over how a span is provided. While |
| 1026 | /// a [`Span`] may be given directly, one may also provide a |
| 1027 | /// `std::ops::Range<usize>`. |
| 1028 | /// |
| 1029 | /// This is like [`Match::new`], but accepts a `usize` instead of a |
| 1030 | /// [`PatternID`]. This panics if the given `usize` is not representable |
| 1031 | /// as a `PatternID`. |
| 1032 | /// |
| 1033 | /// # Panics |
| 1034 | /// |
| 1035 | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |
| 1036 | /// |
| 1037 | /// # Example |
| 1038 | /// |
| 1039 | /// This shows how to create a match for the third pattern in a regex |
| 1040 | /// object using convenient range syntax. |
| 1041 | /// |
| 1042 | /// ``` |
| 1043 | /// use regex_automata::Match; |
| 1044 | /// |
| 1045 | /// let m = Match::must(3, 5..10); |
| 1046 | /// assert_eq!(3, m.pattern().as_usize()); |
| 1047 | /// assert_eq!(5, m.start()); |
| 1048 | /// assert_eq!(10, m.end()); |
| 1049 | /// ``` |
| 1050 | #[inline ] |
| 1051 | pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { |
| 1052 | Match::new(PatternID::must(pattern), span) |
| 1053 | } |
| 1054 | |
| 1055 | /// Returns the ID of the pattern that matched. |
| 1056 | /// |
| 1057 | /// The ID of a pattern is derived from the position in which it was |
| 1058 | /// originally inserted into the corresponding regex engine. The first |
| 1059 | /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and |
| 1060 | /// so on. |
| 1061 | #[inline ] |
| 1062 | pub fn pattern(&self) -> PatternID { |
| 1063 | self.pattern |
| 1064 | } |
| 1065 | |
| 1066 | /// The starting position of the match. |
| 1067 | /// |
| 1068 | /// This is a convenience routine for `Match::span().start`. |
| 1069 | #[inline ] |
| 1070 | pub fn start(&self) -> usize { |
| 1071 | self.span().start |
| 1072 | } |
| 1073 | |
| 1074 | /// The ending position of the match. |
| 1075 | /// |
| 1076 | /// This is a convenience routine for `Match::span().end`. |
| 1077 | #[inline ] |
| 1078 | pub fn end(&self) -> usize { |
| 1079 | self.span().end |
| 1080 | } |
| 1081 | |
| 1082 | /// Returns the match span as a range. |
| 1083 | /// |
| 1084 | /// This is a convenience routine for `Match::span().range()`. |
| 1085 | #[inline ] |
| 1086 | pub fn range(&self) -> core::ops::Range<usize> { |
| 1087 | self.span().range() |
| 1088 | } |
| 1089 | |
| 1090 | /// Returns the span for this match. |
| 1091 | #[inline ] |
| 1092 | pub fn span(&self) -> Span { |
| 1093 | self.span |
| 1094 | } |
| 1095 | |
| 1096 | /// Returns true when the span in this match is empty. |
| 1097 | /// |
| 1098 | /// An empty match can only be returned when the regex itself can match |
| 1099 | /// the empty string. |
| 1100 | #[inline ] |
| 1101 | pub fn is_empty(&self) -> bool { |
| 1102 | self.span().is_empty() |
| 1103 | } |
| 1104 | |
| 1105 | /// Returns the length of this match. |
| 1106 | /// |
| 1107 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
| 1108 | #[inline ] |
| 1109 | pub fn len(&self) -> usize { |
| 1110 | self.span().len() |
| 1111 | } |
| 1112 | } |
| 1113 | |
| 1114 | /// A set of `PatternID`s. |
| 1115 | /// |
| 1116 | /// A set of pattern identifiers is useful for recording which patterns have |
| 1117 | /// matched a particular haystack. A pattern set _only_ includes pattern |
| 1118 | /// identifiers. It does not include offset information. |
| 1119 | /// |
| 1120 | /// # Example |
| 1121 | /// |
| 1122 | /// This shows basic usage of a set. |
| 1123 | /// |
| 1124 | /// ``` |
| 1125 | /// use regex_automata::{PatternID, PatternSet}; |
| 1126 | /// |
| 1127 | /// let pid1 = PatternID::must(5); |
| 1128 | /// let pid2 = PatternID::must(8); |
| 1129 | /// // Create a new empty set. |
| 1130 | /// let mut set = PatternSet::new(10); |
| 1131 | /// // Insert pattern IDs. |
| 1132 | /// set.insert(pid1); |
| 1133 | /// set.insert(pid2); |
| 1134 | /// // Test membership. |
| 1135 | /// assert!(set.contains(pid1)); |
| 1136 | /// assert!(set.contains(pid2)); |
| 1137 | /// // Get all members. |
| 1138 | /// assert_eq!( |
| 1139 | /// vec![5, 8], |
| 1140 | /// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), |
| 1141 | /// ); |
| 1142 | /// // Clear the set. |
| 1143 | /// set.clear(); |
| 1144 | /// // Test that it is indeed empty. |
| 1145 | /// assert!(set.is_empty()); |
| 1146 | /// ``` |
| 1147 | #[cfg (feature = "alloc" )] |
| 1148 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 1149 | pub struct PatternSet { |
| 1150 | /// The number of patterns set to 'true' in this set. |
| 1151 | len: usize, |
| 1152 | /// A map from PatternID to boolean of whether a pattern matches or not. |
| 1153 | /// |
| 1154 | /// This should probably be a bitset, but it's probably unlikely to matter |
| 1155 | /// much in practice. |
| 1156 | /// |
| 1157 | /// The main downside of this representation (and similarly for a bitset) |
| 1158 | /// is that iteration scales with the capacity of the set instead of |
| 1159 | /// the length of the set. This doesn't seem likely to be a problem in |
| 1160 | /// practice. |
| 1161 | /// |
| 1162 | /// Another alternative is to just use a 'SparseSet' for this. It does use |
| 1163 | /// more memory (quite a bit more), but that seems fine I think compared |
| 1164 | /// to the memory being used by the regex engine. The real hiccup with |
| 1165 | /// it is that it yields pattern IDs in the order they were inserted. |
| 1166 | /// Which is actually kind of nice, but at the time of writing, pattern |
| 1167 | /// IDs are yielded in ascending order in the regex crate RegexSet API. |
| 1168 | /// If we did change to 'SparseSet', we could provide an additional |
| 1169 | /// 'iter_match_order' iterator, but keep the ascending order one for |
| 1170 | /// compatibility. |
| 1171 | which: alloc::boxed::Box<[bool]>, |
| 1172 | } |
| 1173 | |
| 1174 | #[cfg (feature = "alloc" )] |
| 1175 | impl PatternSet { |
| 1176 | /// Create a new set of pattern identifiers with the given capacity. |
| 1177 | /// |
| 1178 | /// The given capacity typically corresponds to (at least) the number of |
| 1179 | /// patterns in a compiled regex object. |
| 1180 | /// |
| 1181 | /// # Panics |
| 1182 | /// |
| 1183 | /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is |
| 1184 | /// impossible if you use the `pattern_len()` method as defined on any of |
| 1185 | /// the regex engines in this crate. Namely, a regex will fail to build by |
| 1186 | /// returning an error if the number of patterns given to it exceeds the |
| 1187 | /// limit. Therefore, the number of patterns in a valid regex is always |
| 1188 | /// a correct capacity to provide here. |
| 1189 | pub fn new(capacity: usize) -> PatternSet { |
| 1190 | assert!( |
| 1191 | capacity <= PatternID::LIMIT, |
| 1192 | "pattern set capacity exceeds limit of {}" , |
| 1193 | PatternID::LIMIT, |
| 1194 | ); |
| 1195 | PatternSet { |
| 1196 | len: 0, |
| 1197 | which: alloc::vec![false; capacity].into_boxed_slice(), |
| 1198 | } |
| 1199 | } |
| 1200 | |
| 1201 | /// Clear this set such that it contains no pattern IDs. |
| 1202 | pub fn clear(&mut self) { |
| 1203 | self.len = 0; |
| 1204 | for matched in self.which.iter_mut() { |
| 1205 | *matched = false; |
| 1206 | } |
| 1207 | } |
| 1208 | |
| 1209 | /// Return true if and only if the given pattern identifier is in this set. |
| 1210 | pub fn contains(&self, pid: PatternID) -> bool { |
| 1211 | pid.as_usize() < self.capacity() && self.which[pid] |
| 1212 | } |
| 1213 | |
| 1214 | /// Insert the given pattern identifier into this set and return `true` if |
| 1215 | /// the given pattern ID was not previously in this set. |
| 1216 | /// |
| 1217 | /// If the pattern identifier is already in this set, then this is a no-op. |
| 1218 | /// |
| 1219 | /// Use [`PatternSet::try_insert`] for a fallible version of this routine. |
| 1220 | /// |
| 1221 | /// # Panics |
| 1222 | /// |
| 1223 | /// This panics if this pattern set has insufficient capacity to |
| 1224 | /// store the given pattern ID. |
| 1225 | pub fn insert(&mut self, pid: PatternID) -> bool { |
| 1226 | self.try_insert(pid) |
| 1227 | .expect("PatternSet should have sufficient capacity" ) |
| 1228 | } |
| 1229 | |
| 1230 | /// Insert the given pattern identifier into this set and return `true` if |
| 1231 | /// the given pattern ID was not previously in this set. |
| 1232 | /// |
| 1233 | /// If the pattern identifier is already in this set, then this is a no-op. |
| 1234 | /// |
| 1235 | /// # Errors |
| 1236 | /// |
| 1237 | /// This returns an error if this pattern set has insufficient capacity to |
| 1238 | /// store the given pattern ID. |
| 1239 | pub fn try_insert( |
| 1240 | &mut self, |
| 1241 | pid: PatternID, |
| 1242 | ) -> Result<bool, PatternSetInsertError> { |
| 1243 | if pid.as_usize() >= self.capacity() { |
| 1244 | return Err(PatternSetInsertError { |
| 1245 | attempted: pid, |
| 1246 | capacity: self.capacity(), |
| 1247 | }); |
| 1248 | } |
| 1249 | if self.which[pid] { |
| 1250 | return Ok(false); |
| 1251 | } |
| 1252 | self.len += 1; |
| 1253 | self.which[pid] = true; |
| 1254 | Ok(true) |
| 1255 | } |
| 1256 | |
| 1257 | /* |
| 1258 | // This is currently commented out because it is unused and it is unclear |
| 1259 | // whether it's useful or not. What's the harm in having it? When, if |
| 1260 | // we ever wanted to change our representation to a 'SparseSet', then |
| 1261 | // supporting this method would be a bit tricky. So in order to keep some |
| 1262 | // API evolution flexibility, we leave it out for now. |
| 1263 | |
| 1264 | /// Remove the given pattern identifier from this set. |
| 1265 | /// |
| 1266 | /// If the pattern identifier was not previously in this set, then this |
| 1267 | /// does not change the set and returns `false`. |
| 1268 | /// |
| 1269 | /// # Panics |
| 1270 | /// |
| 1271 | /// This panics if `pid` exceeds the capacity of this set. |
| 1272 | pub fn remove(&mut self, pid: PatternID) -> bool { |
| 1273 | if !self.which[pid] { |
| 1274 | return false; |
| 1275 | } |
| 1276 | self.len -= 1; |
| 1277 | self.which[pid] = false; |
| 1278 | true |
| 1279 | } |
| 1280 | */ |
| 1281 | |
| 1282 | /// Return true if and only if this set has no pattern identifiers in it. |
| 1283 | pub fn is_empty(&self) -> bool { |
| 1284 | self.len() == 0 |
| 1285 | } |
| 1286 | |
| 1287 | /// Return true if and only if this set has the maximum number of pattern |
| 1288 | /// identifiers in the set. This occurs precisely when `PatternSet::len() |
| 1289 | /// == PatternSet::capacity()`. |
| 1290 | /// |
| 1291 | /// This particular property is useful to test because it may allow one to |
| 1292 | /// stop a search earlier than you might otherwise. Namely, if a search is |
| 1293 | /// only reporting which patterns match a haystack and if you know all of |
| 1294 | /// the patterns match at a given point, then there's no new information |
| 1295 | /// that can be learned by continuing the search. (Because a pattern set |
| 1296 | /// does not keep track of offset information.) |
| 1297 | pub fn is_full(&self) -> bool { |
| 1298 | self.len() == self.capacity() |
| 1299 | } |
| 1300 | |
| 1301 | /// Returns the total number of pattern identifiers in this set. |
| 1302 | pub fn len(&self) -> usize { |
| 1303 | self.len |
| 1304 | } |
| 1305 | |
| 1306 | /// Returns the total number of pattern identifiers that may be stored |
| 1307 | /// in this set. |
| 1308 | /// |
| 1309 | /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. |
| 1310 | /// |
| 1311 | /// Typically, the capacity of a pattern set matches the number of patterns |
| 1312 | /// in a regex object with which you are searching. |
| 1313 | pub fn capacity(&self) -> usize { |
| 1314 | self.which.len() |
| 1315 | } |
| 1316 | |
| 1317 | /// Returns an iterator over all pattern identifiers in this set. |
| 1318 | /// |
| 1319 | /// The iterator yields pattern identifiers in ascending order, starting |
| 1320 | /// at zero. |
| 1321 | pub fn iter(&self) -> PatternSetIter<'_> { |
| 1322 | PatternSetIter { it: self.which.iter().enumerate() } |
| 1323 | } |
| 1324 | } |
| 1325 | |
| 1326 | /// An error that occurs when a `PatternID` failed to insert into a |
| 1327 | /// `PatternSet`. |
| 1328 | /// |
| 1329 | /// An insert fails when the given `PatternID` exceeds the configured capacity |
| 1330 | /// of the `PatternSet`. |
| 1331 | /// |
| 1332 | /// This error is created by the [`PatternSet::try_insert`] routine. |
| 1333 | #[cfg (feature = "alloc" )] |
| 1334 | #[derive (Clone, Debug)] |
| 1335 | pub struct PatternSetInsertError { |
| 1336 | attempted: PatternID, |
| 1337 | capacity: usize, |
| 1338 | } |
| 1339 | |
| 1340 | #[cfg (feature = "std" )] |
| 1341 | impl std::error::Error for PatternSetInsertError {} |
| 1342 | |
| 1343 | #[cfg (feature = "alloc" )] |
| 1344 | impl core::fmt::Display for PatternSetInsertError { |
| 1345 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 1346 | write!( |
| 1347 | f, |
| 1348 | "failed to insert pattern ID {} into pattern set \ |
| 1349 | with insufficiet capacity of {}" , |
| 1350 | self.attempted.as_usize(), |
| 1351 | self.capacity, |
| 1352 | ) |
| 1353 | } |
| 1354 | } |
| 1355 | |
| 1356 | /// An iterator over all pattern identifiers in a [`PatternSet`]. |
| 1357 | /// |
| 1358 | /// The lifetime parameter `'a` refers to the lifetime of the pattern set being |
| 1359 | /// iterated over. |
| 1360 | /// |
| 1361 | /// This iterator is created by the [`PatternSet::iter`] method. |
| 1362 | #[cfg (feature = "alloc" )] |
| 1363 | #[derive (Clone, Debug)] |
| 1364 | pub struct PatternSetIter<'a> { |
| 1365 | it: core::iter::Enumerate<core::slice::Iter<'a, bool>>, |
| 1366 | } |
| 1367 | |
| 1368 | #[cfg (feature = "alloc" )] |
| 1369 | impl<'a> Iterator for PatternSetIter<'a> { |
| 1370 | type Item = PatternID; |
| 1371 | |
| 1372 | fn next(&mut self) -> Option<PatternID> { |
| 1373 | while let Some((index: usize, &yes: bool)) = self.it.next() { |
| 1374 | if yes { |
| 1375 | // Only valid 'PatternID' values can be inserted into the set |
| 1376 | // and construction of the set panics if the capacity would |
| 1377 | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
| 1378 | // precisely when 'index' corresponds to a valid 'PatternID'. |
| 1379 | return Some(PatternID::new_unchecked(index)); |
| 1380 | } |
| 1381 | } |
| 1382 | None |
| 1383 | } |
| 1384 | |
| 1385 | fn size_hint(&self) -> (usize, Option<usize>) { |
| 1386 | self.it.size_hint() |
| 1387 | } |
| 1388 | } |
| 1389 | |
| 1390 | #[cfg (feature = "alloc" )] |
| 1391 | impl<'a> DoubleEndedIterator for PatternSetIter<'a> { |
| 1392 | fn next_back(&mut self) -> Option<PatternID> { |
| 1393 | while let Some((index: usize, &yes: bool)) = self.it.next_back() { |
| 1394 | if yes { |
| 1395 | // Only valid 'PatternID' values can be inserted into the set |
| 1396 | // and construction of the set panics if the capacity would |
| 1397 | // permit storing invalid pattern IDs. Thus, 'yes' is only true |
| 1398 | // precisely when 'index' corresponds to a valid 'PatternID'. |
| 1399 | return Some(PatternID::new_unchecked(index)); |
| 1400 | } |
| 1401 | } |
| 1402 | None |
| 1403 | } |
| 1404 | } |
| 1405 | |
| 1406 | /// The type of anchored search to perform. |
| 1407 | /// |
| 1408 | /// This is *almost* a boolean option. That is, you can either do an unanchored |
| 1409 | /// search for any pattern in a regex, or you can do an anchored search for any |
| 1410 | /// pattern in a regex. |
| 1411 | /// |
| 1412 | /// A third option exists that, assuming the regex engine supports it, permits |
| 1413 | /// you to do an anchored search for a specific pattern. |
| 1414 | /// |
| 1415 | /// Note that there is no way to run an unanchored search for a specific |
| 1416 | /// pattern. If you need that, you'll need to build separate regexes for each |
| 1417 | /// pattern. |
| 1418 | /// |
| 1419 | /// # Errors |
| 1420 | /// |
| 1421 | /// If a regex engine does not support the anchored mode selected, then the |
| 1422 | /// regex engine will return an error. While any non-trivial regex engine |
| 1423 | /// should support at least one of the available anchored modes, there is no |
| 1424 | /// singular mode that is guaranteed to be universally supported. Some regex |
| 1425 | /// engines might only support unanchored searches (DFAs compiled without |
| 1426 | /// anchored starting states) and some regex engines might only support |
| 1427 | /// anchored searches (like the one-pass DFA). |
| 1428 | /// |
| 1429 | /// The specific error returned is a [`MatchError`] with a |
| 1430 | /// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the |
| 1431 | /// `Anchored` value given that is unsupported. |
| 1432 | /// |
| 1433 | /// Note that regex engines should report "no match" if, for example, an |
| 1434 | /// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where |
| 1435 | /// anchored searches for a specific pattern are supported. This is smooths out |
| 1436 | /// behavior such that it's possible to guarantee that an error never occurs |
| 1437 | /// based on how the regex engine is configured. All regex engines in this |
| 1438 | /// crate report "no match" when searching for an invalid pattern ID, but where |
| 1439 | /// searching for a valid pattern ID is otherwise supported. |
| 1440 | /// |
| 1441 | /// # Example |
| 1442 | /// |
| 1443 | /// This example shows how to use the various `Anchored` modes to run a |
| 1444 | /// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) |
| 1445 | /// because it supports all modes unconditionally. Some regex engines, like |
| 1446 | /// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored |
| 1447 | /// searches. |
| 1448 | /// |
| 1449 | /// ``` |
| 1450 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1451 | /// use regex_automata::{ |
| 1452 | /// nfa::thompson::pikevm::PikeVM, |
| 1453 | /// Anchored, Input, Match, PatternID, |
| 1454 | /// }; |
| 1455 | /// |
| 1456 | /// let re = PikeVM::new_many(&[ |
| 1457 | /// r"Mrs. \w+" , |
| 1458 | /// r"Miss \w+" , |
| 1459 | /// r"Mr. \w+" , |
| 1460 | /// r"Ms. \w+" , |
| 1461 | /// ])?; |
| 1462 | /// let mut cache = re.create_cache(); |
| 1463 | /// let hay = "Hello Mr. Springsteen!" ; |
| 1464 | /// |
| 1465 | /// // The default is to do an unanchored search. |
| 1466 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
| 1467 | /// // Explicitly ask for an unanchored search. Same as above. |
| 1468 | /// let input = Input::new(hay).anchored(Anchored::No); |
| 1469 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); |
| 1470 | /// |
| 1471 | /// // Now try an anchored search. Since the match doesn't start at the |
| 1472 | /// // beginning of the haystack, no match is found! |
| 1473 | /// let input = Input::new(hay).anchored(Anchored::Yes); |
| 1474 | /// assert_eq!(None, re.find(&mut cache, input)); |
| 1475 | /// |
| 1476 | /// // We can try an anchored search again, but move the location of where |
| 1477 | /// // we start the search. Note that the offsets reported are still in |
| 1478 | /// // terms of the overall haystack and not relative to where we started |
| 1479 | /// // the search. |
| 1480 | /// let input = Input::new(hay).anchored(Anchored::Yes).range(6..); |
| 1481 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
| 1482 | /// |
| 1483 | /// // Now try an anchored search for a specific pattern. We specifically |
| 1484 | /// // choose a pattern that we know doesn't match to prove that the search |
| 1485 | /// // only looks for the pattern we provide. |
| 1486 | /// let input = Input::new(hay) |
| 1487 | /// .anchored(Anchored::Pattern(PatternID::must(1))) |
| 1488 | /// .range(6..); |
| 1489 | /// assert_eq!(None, re.find(&mut cache, input)); |
| 1490 | /// |
| 1491 | /// // But if we switch it to the pattern that we know matches, then we find |
| 1492 | /// // the match. |
| 1493 | /// let input = Input::new(hay) |
| 1494 | /// .anchored(Anchored::Pattern(PatternID::must(2))) |
| 1495 | /// .range(6..); |
| 1496 | /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); |
| 1497 | /// |
| 1498 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1499 | /// ``` |
| 1500 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
| 1501 | pub enum Anchored { |
| 1502 | /// Run an unanchored search. This means a match may occur anywhere at or |
| 1503 | /// after the start position of the search. |
| 1504 | /// |
| 1505 | /// This search can return a match for any pattern in the regex. |
| 1506 | No, |
| 1507 | /// Run an anchored search. This means that a match must begin at the |
| 1508 | /// start position of the search. |
| 1509 | /// |
| 1510 | /// This search can return a match for any pattern in the regex. |
| 1511 | Yes, |
| 1512 | /// Run an anchored search for a specific pattern. This means that a match |
| 1513 | /// must be for the given pattern and must begin at the start position of |
| 1514 | /// the search. |
| 1515 | Pattern(PatternID), |
| 1516 | } |
| 1517 | |
| 1518 | impl Anchored { |
| 1519 | /// Returns true if and only if this anchor mode corresponds to any kind of |
| 1520 | /// anchored search. |
| 1521 | /// |
| 1522 | /// # Example |
| 1523 | /// |
| 1524 | /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` |
| 1525 | /// are considered anchored searches. |
| 1526 | /// |
| 1527 | /// ``` |
| 1528 | /// use regex_automata::{Anchored, PatternID}; |
| 1529 | /// |
| 1530 | /// assert!(!Anchored::No.is_anchored()); |
| 1531 | /// assert!(Anchored::Yes.is_anchored()); |
| 1532 | /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); |
| 1533 | /// ``` |
| 1534 | #[inline ] |
| 1535 | pub fn is_anchored(&self) -> bool { |
| 1536 | matches!(*self, Anchored::Yes | Anchored::Pattern(_)) |
| 1537 | } |
| 1538 | |
| 1539 | /// Returns the pattern ID associated with this configuration if it is an |
| 1540 | /// anchored search for a specific pattern. Otherwise `None` is returned. |
| 1541 | /// |
| 1542 | /// # Example |
| 1543 | /// |
| 1544 | /// ``` |
| 1545 | /// use regex_automata::{Anchored, PatternID}; |
| 1546 | /// |
| 1547 | /// assert_eq!(None, Anchored::No.pattern()); |
| 1548 | /// assert_eq!(None, Anchored::Yes.pattern()); |
| 1549 | /// |
| 1550 | /// let pid = PatternID::must(5); |
| 1551 | /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); |
| 1552 | /// ``` |
| 1553 | #[inline ] |
| 1554 | pub fn pattern(&self) -> Option<PatternID> { |
| 1555 | match *self { |
| 1556 | Anchored::Pattern(pid) => Some(pid), |
| 1557 | _ => None, |
| 1558 | } |
| 1559 | } |
| 1560 | } |
| 1561 | |
| 1562 | /// The kind of match semantics to use for a regex pattern. |
| 1563 | /// |
| 1564 | /// The default match kind is `LeftmostFirst`, and this corresponds to the |
| 1565 | /// match semantics used by most backtracking engines, such as Perl. |
| 1566 | /// |
| 1567 | /// # Leftmost first or "preference order" match semantics |
| 1568 | /// |
| 1569 | /// Leftmost-first semantics determine which match to report when there are |
| 1570 | /// multiple paths through a regex that match at the same position. The tie is |
| 1571 | /// essentially broken by how a backtracker would behave. For example, consider |
| 1572 | /// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this |
| 1573 | /// case, both the `foofoo` and `foo` branches match at position `0`. So should |
| 1574 | /// the end of the match be `3` or `6`? |
| 1575 | /// |
| 1576 | /// A backtracker will conceptually work by trying `foofoofoo` and failing. |
| 1577 | /// Then it will try `foofoo`, find the match and stop there. Thus, the |
| 1578 | /// leftmost-first match position is `6`. This is called "leftmost-first" or |
| 1579 | /// "preference order" because the order of the branches as written in the |
| 1580 | /// regex pattern is what determines how to break the tie. |
| 1581 | /// |
| 1582 | /// (Note that leftmost-longest match semantics, which break ties by always |
| 1583 | /// taking the longest matching string, are not currently supported by this |
| 1584 | /// crate. These match semantics tend to be found in POSIX regex engines.) |
| 1585 | /// |
| 1586 | /// This example shows how leftmost-first semantics work, and how it even |
| 1587 | /// applies to multi-pattern regexes: |
| 1588 | /// |
| 1589 | /// ``` |
| 1590 | /// use regex_automata::{ |
| 1591 | /// nfa::thompson::pikevm::PikeVM, |
| 1592 | /// Match, |
| 1593 | /// }; |
| 1594 | /// |
| 1595 | /// let re = PikeVM::new_many(&[ |
| 1596 | /// r"foofoofoo" , |
| 1597 | /// r"foofoo" , |
| 1598 | /// r"foo" , |
| 1599 | /// ])?; |
| 1600 | /// let mut cache = re.create_cache(); |
| 1601 | /// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo" ).collect(); |
| 1602 | /// let expected = vec![Match::must(1, 0..6)]; |
| 1603 | /// assert_eq!(expected, got); |
| 1604 | /// |
| 1605 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1606 | /// ``` |
| 1607 | /// |
| 1608 | /// # All matches |
| 1609 | /// |
| 1610 | /// The `All` match semantics report any and all matches, and generally will |
| 1611 | /// attempt to match as much as possible. It doesn't respect any sort of match |
| 1612 | /// priority at all, so things like non-greedy matching don't work in this |
| 1613 | /// mode. |
| 1614 | /// |
| 1615 | /// The fact that non-greedy matching doesn't work generally makes most forms |
| 1616 | /// of unanchored non-overlapping searches have unintuitive behavior. Namely, |
| 1617 | /// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the |
| 1618 | /// beginning of the pattern, which is specifically non-greedy. Since it will |
| 1619 | /// be treated as greedy in `All` match semantics, this generally means that |
| 1620 | /// it will first attempt to consume all of the haystack and is likely to wind |
| 1621 | /// up skipping matches. |
| 1622 | /// |
| 1623 | /// Generally speaking, `All` should only be used in two circumstances: |
| 1624 | /// |
| 1625 | /// * When running an anchored search and there is a desire to match as much as |
| 1626 | /// possible. For example, when building a reverse regex matcher to find the |
| 1627 | /// start of a match after finding the end. In this case, the reverse search |
| 1628 | /// is anchored to the end of the match found by the forward search. |
| 1629 | /// * When running overlapping searches. Since `All` encodes all possible |
| 1630 | /// matches, this is generally what you want for an overlapping search. If you |
| 1631 | /// try to use leftmost-first in an overlapping search, it is likely to produce |
| 1632 | /// counter-intuitive results since leftmost-first specifically excludes some |
| 1633 | /// matches from its underlying finite state machine. |
| 1634 | /// |
| 1635 | /// This example demonstrates the counter-intuitive behavior of `All` semantics |
| 1636 | /// when using a standard leftmost unanchored search: |
| 1637 | /// |
| 1638 | /// ``` |
| 1639 | /// use regex_automata::{ |
| 1640 | /// nfa::thompson::pikevm::PikeVM, |
| 1641 | /// Match, MatchKind, |
| 1642 | /// }; |
| 1643 | /// |
| 1644 | /// let re = PikeVM::builder() |
| 1645 | /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
| 1646 | /// .build("foo" )?; |
| 1647 | /// let hay = "first foo second foo wat" ; |
| 1648 | /// let mut cache = re.create_cache(); |
| 1649 | /// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); |
| 1650 | /// // Notice that it completely skips the first 'foo'! |
| 1651 | /// let expected = vec![Match::must(0, 17..20)]; |
| 1652 | /// assert_eq!(expected, got); |
| 1653 | /// |
| 1654 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1655 | /// ``` |
| 1656 | /// |
| 1657 | /// This second example shows how `All` semantics are useful for an overlapping |
| 1658 | /// search. Note that we use lower level lazy DFA APIs here since the NFA |
| 1659 | /// engines only currently support a very limited form of overlapping search. |
| 1660 | /// |
| 1661 | /// ``` |
| 1662 | /// use regex_automata::{ |
| 1663 | /// hybrid::dfa::{DFA, OverlappingState}, |
| 1664 | /// HalfMatch, Input, MatchKind, |
| 1665 | /// }; |
| 1666 | /// |
| 1667 | /// let re = DFA::builder() |
| 1668 | /// // If we didn't set 'All' semantics here, then the regex would only |
| 1669 | /// // match 'foo' at offset 3 and nothing else. Why? Because the state |
| 1670 | /// // machine implements preference order and knows that the 'foofoo' and |
| 1671 | /// // 'foofoofoo' branches can never match since 'foo' will always match |
| 1672 | /// // when they match and take priority. |
| 1673 | /// .configure(DFA::config().match_kind(MatchKind::All)) |
| 1674 | /// .build(r"foo|foofoo|foofoofoo" )?; |
| 1675 | /// let mut cache = re.create_cache(); |
| 1676 | /// let mut state = OverlappingState::start(); |
| 1677 | /// let input = Input::new("foofoofoo" ); |
| 1678 | /// let mut got = vec![]; |
| 1679 | /// loop { |
| 1680 | /// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; |
| 1681 | /// let m = match state.get_match() { |
| 1682 | /// None => break, |
| 1683 | /// Some(m) => m, |
| 1684 | /// }; |
| 1685 | /// got.push(m); |
| 1686 | /// } |
| 1687 | /// let expected = vec![ |
| 1688 | /// HalfMatch::must(0, 3), |
| 1689 | /// HalfMatch::must(0, 6), |
| 1690 | /// HalfMatch::must(0, 9), |
| 1691 | /// ]; |
| 1692 | /// assert_eq!(expected, got); |
| 1693 | /// |
| 1694 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1695 | /// ``` |
| 1696 | #[non_exhaustive ] |
| 1697 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
| 1698 | pub enum MatchKind { |
| 1699 | /// Report all possible matches. |
| 1700 | All, |
| 1701 | /// Report only the leftmost matches. When multiple leftmost matches exist, |
| 1702 | /// report the match corresponding to the part of the regex that appears |
| 1703 | /// first in the syntax. |
| 1704 | LeftmostFirst, |
| 1705 | // There is prior art in RE2 that shows that we should be able to add |
| 1706 | // LeftmostLongest too. The tricky part of it is supporting ungreedy |
| 1707 | // repetitions. Instead of treating all NFA states as having equivalent |
| 1708 | // priority (as in 'All') or treating all NFA states as having distinct |
| 1709 | // priority based on order (as in 'LeftmostFirst'), we instead group NFA |
| 1710 | // states into sets, and treat members of each set as having equivalent |
| 1711 | // priority, but having greater priority than all following members |
| 1712 | // of different sets. |
| 1713 | // |
| 1714 | // However, it's not clear whether it's really worth adding this. After |
| 1715 | // all, leftmost-longest can be emulated when using literals by using |
| 1716 | // leftmost-first and sorting the literals by length in descending order. |
| 1717 | // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will |
| 1718 | // always match `a` in `ab` when using leftmost-first, but leftmost-longest |
| 1719 | // would match `ab`. |
| 1720 | } |
| 1721 | |
| 1722 | impl MatchKind { |
| 1723 | #[cfg (feature = "alloc" )] |
| 1724 | pub(crate) fn continue_past_first_match(&self) -> bool { |
| 1725 | *self == MatchKind::All |
| 1726 | } |
| 1727 | } |
| 1728 | |
| 1729 | impl Default for MatchKind { |
| 1730 | fn default() -> MatchKind { |
| 1731 | MatchKind::LeftmostFirst |
| 1732 | } |
| 1733 | } |
| 1734 | |
| 1735 | /// An error indicating that a search stopped before reporting whether a |
| 1736 | /// match exists or not. |
| 1737 | /// |
| 1738 | /// To be very clear, this error type implies that one cannot assume that no |
| 1739 | /// matches occur, since the search stopped before completing. That is, if |
| 1740 | /// you're looking for information about where a search determined that no |
| 1741 | /// match can occur, then this error type does *not* give you that. (Indeed, at |
| 1742 | /// the time of writing, if you need such a thing, you have to write your own |
| 1743 | /// search routine.) |
| 1744 | /// |
| 1745 | /// Normally, when one searches for something, the response is either an |
| 1746 | /// affirmative "it was found at this location" or a negative "not found at |
| 1747 | /// all." However, in some cases, a regex engine can be configured to stop its |
| 1748 | /// search before concluding whether a match exists or not. When this happens, |
| 1749 | /// it may be important for the caller to know why the regex engine gave up and |
| 1750 | /// where in the input it gave up at. This error type exposes the 'why' and the |
| 1751 | /// 'where.' |
| 1752 | /// |
| 1753 | /// For example, the DFAs provided by this library generally cannot correctly |
| 1754 | /// implement Unicode word boundaries. Instead, they provide an option to |
| 1755 | /// eagerly support them on ASCII text (since Unicode word boundaries are |
| 1756 | /// equivalent to ASCII word boundaries when searching ASCII text), but will |
| 1757 | /// "give up" if a non-ASCII byte is seen. In such cases, one is usually |
| 1758 | /// required to either report the failure to the caller (unergonomic) or |
| 1759 | /// otherwise fall back to some other regex engine (ergonomic, but potentially |
| 1760 | /// costly). |
| 1761 | /// |
| 1762 | /// More generally, some regex engines offer the ability for callers to specify |
| 1763 | /// certain bytes that will trigger the regex engine to automatically quit if |
| 1764 | /// they are seen. |
| 1765 | /// |
| 1766 | /// Still yet, there may be other reasons for a failed match. For example, |
| 1767 | /// the hybrid DFA provided by this crate can be configured to give up if it |
| 1768 | /// believes that it is not efficient. This in turn permits callers to choose a |
| 1769 | /// different regex engine. |
| 1770 | /// |
| 1771 | /// (Note that DFAs are configured by default to never quit or give up in this |
| 1772 | /// fashion. For example, by default, a DFA will fail to build if the regex |
| 1773 | /// pattern contains a Unicode word boundary. One needs to opt into the "quit" |
| 1774 | /// behavior via options, like |
| 1775 | /// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) |
| 1776 | /// |
| 1777 | /// There are a couple other ways a search |
| 1778 | /// can fail. For example, when using the |
| 1779 | /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) |
| 1780 | /// with a haystack that is too long, or trying to run an unanchored search |
| 1781 | /// with a [one-pass DFA](crate::dfa::onepass). |
| 1782 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 1783 | pub struct MatchError( |
| 1784 | #[cfg (feature = "alloc" )] alloc::boxed::Box<MatchErrorKind>, |
| 1785 | #[cfg (not(feature = "alloc" ))] MatchErrorKind, |
| 1786 | ); |
| 1787 | |
| 1788 | impl MatchError { |
| 1789 | /// Create a new error value with the given kind. |
| 1790 | /// |
| 1791 | /// This is a more verbose version of the kind-specific constructors, |
| 1792 | /// e.g., `MatchError::quit`. |
| 1793 | pub fn new(kind: MatchErrorKind) -> MatchError { |
| 1794 | #[cfg (feature = "alloc" )] |
| 1795 | { |
| 1796 | MatchError(alloc::boxed::Box::new(kind)) |
| 1797 | } |
| 1798 | #[cfg (not(feature = "alloc" ))] |
| 1799 | { |
| 1800 | MatchError(kind) |
| 1801 | } |
| 1802 | } |
| 1803 | |
| 1804 | /// Returns a reference to the underlying error kind. |
| 1805 | pub fn kind(&self) -> &MatchErrorKind { |
| 1806 | &self.0 |
| 1807 | } |
| 1808 | |
| 1809 | /// Create a new "quit" error. The given `byte` corresponds to the value |
| 1810 | /// that tripped a search's quit condition, and `offset` corresponds to the |
| 1811 | /// location in the haystack at which the search quit. |
| 1812 | /// |
| 1813 | /// This is the same as calling `MatchError::new` with a |
| 1814 | /// [`MatchErrorKind::Quit`] kind. |
| 1815 | pub fn quit(byte: u8, offset: usize) -> MatchError { |
| 1816 | MatchError::new(MatchErrorKind::Quit { byte, offset }) |
| 1817 | } |
| 1818 | |
| 1819 | /// Create a new "gave up" error. The given `offset` corresponds to the |
| 1820 | /// location in the haystack at which the search gave up. |
| 1821 | /// |
| 1822 | /// This is the same as calling `MatchError::new` with a |
| 1823 | /// [`MatchErrorKind::GaveUp`] kind. |
| 1824 | pub fn gave_up(offset: usize) -> MatchError { |
| 1825 | MatchError::new(MatchErrorKind::GaveUp { offset }) |
| 1826 | } |
| 1827 | |
| 1828 | /// Create a new "haystack too long" error. The given `len` corresponds to |
| 1829 | /// the length of the haystack that was problematic. |
| 1830 | /// |
| 1831 | /// This is the same as calling `MatchError::new` with a |
| 1832 | /// [`MatchErrorKind::HaystackTooLong`] kind. |
| 1833 | pub fn haystack_too_long(len: usize) -> MatchError { |
| 1834 | MatchError::new(MatchErrorKind::HaystackTooLong { len }) |
| 1835 | } |
| 1836 | |
| 1837 | /// Create a new "unsupported anchored" error. This occurs when the caller |
| 1838 | /// requests a search with an anchor mode that is not supported by the |
| 1839 | /// regex engine. |
| 1840 | /// |
| 1841 | /// This is the same as calling `MatchError::new` with a |
| 1842 | /// [`MatchErrorKind::UnsupportedAnchored`] kind. |
| 1843 | pub fn unsupported_anchored(mode: Anchored) -> MatchError { |
| 1844 | MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) |
| 1845 | } |
| 1846 | } |
| 1847 | |
| 1848 | /// The underlying kind of a [`MatchError`]. |
| 1849 | /// |
| 1850 | /// This is a **non-exhaustive** enum. That means new variants may be added in |
| 1851 | /// a semver-compatible release. |
| 1852 | #[non_exhaustive ] |
| 1853 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 1854 | pub enum MatchErrorKind { |
| 1855 | /// The search saw a "quit" byte at which it was instructed to stop |
| 1856 | /// searching. |
| 1857 | Quit { |
| 1858 | /// The "quit" byte that was observed that caused the search to stop. |
| 1859 | byte: u8, |
| 1860 | /// The offset at which the quit byte was observed. |
| 1861 | offset: usize, |
| 1862 | }, |
| 1863 | /// The search, based on heuristics, determined that it would be better |
| 1864 | /// to stop, typically to provide the caller an opportunity to use an |
| 1865 | /// alternative regex engine. |
| 1866 | /// |
| 1867 | /// Currently, the only way for this to occur is via the lazy DFA and |
| 1868 | /// only when it is configured to do so (it will not return this error by |
| 1869 | /// default). |
| 1870 | GaveUp { |
| 1871 | /// The offset at which the search stopped. This corresponds to the |
| 1872 | /// position immediately following the last byte scanned. |
| 1873 | offset: usize, |
| 1874 | }, |
| 1875 | /// This error occurs if the haystack given to the regex engine was too |
| 1876 | /// long to be searched. This occurs, for example, with regex engines |
| 1877 | /// like the bounded backtracker that have a configurable fixed amount of |
| 1878 | /// capacity that is tied to the length of the haystack. Anything beyond |
| 1879 | /// that configured limit will result in an error at search time. |
| 1880 | HaystackTooLong { |
| 1881 | /// The length of the haystack that exceeded the limit. |
| 1882 | len: usize, |
| 1883 | }, |
| 1884 | /// An error indicating that a particular type of anchored search was |
| 1885 | /// requested, but that the regex engine does not support it. |
| 1886 | /// |
| 1887 | /// Note that this error should not be returned by a regex engine simply |
| 1888 | /// because the pattern ID is invalid (i.e., equal to or exceeds the number |
| 1889 | /// of patterns in the regex). In that case, the regex engine should report |
| 1890 | /// a non-match. |
| 1891 | UnsupportedAnchored { |
| 1892 | /// The anchored mode given that is unsupported. |
| 1893 | mode: Anchored, |
| 1894 | }, |
| 1895 | } |
| 1896 | |
| 1897 | #[cfg (feature = "std" )] |
| 1898 | impl std::error::Error for MatchError {} |
| 1899 | |
| 1900 | impl core::fmt::Display for MatchError { |
| 1901 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 1902 | match *self.kind() { |
| 1903 | MatchErrorKind::Quit { byte, offset } => write!( |
| 1904 | f, |
| 1905 | "quit search after observing byte {:?} at offset {}" , |
| 1906 | DebugByte(byte), |
| 1907 | offset, |
| 1908 | ), |
| 1909 | MatchErrorKind::GaveUp { offset } => { |
| 1910 | write!(f, "gave up searching at offset {}" , offset) |
| 1911 | } |
| 1912 | MatchErrorKind::HaystackTooLong { len } => { |
| 1913 | write!(f, "haystack of length {} is too long" , len) |
| 1914 | } |
| 1915 | MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => { |
| 1916 | write!(f, "anchored searches are not supported or enabled" ) |
| 1917 | } |
| 1918 | MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => { |
| 1919 | write!(f, "unanchored searches are not supported or enabled" ) |
| 1920 | } |
| 1921 | MatchErrorKind::UnsupportedAnchored { |
| 1922 | mode: Anchored::Pattern(pid), |
| 1923 | } => { |
| 1924 | write!( |
| 1925 | f, |
| 1926 | "anchored searches for a specific pattern ( {}) are \ |
| 1927 | not supported or enabled" , |
| 1928 | pid.as_usize(), |
| 1929 | ) |
| 1930 | } |
| 1931 | } |
| 1932 | } |
| 1933 | } |
| 1934 | |
| 1935 | #[cfg (test)] |
| 1936 | mod tests { |
| 1937 | use super::*; |
| 1938 | |
| 1939 | // We test that our 'MatchError' type is the size we expect. This isn't an |
| 1940 | // API guarantee, but if the size increases, we really want to make sure we |
| 1941 | // decide to do that intentionally. So this should be a speed bump. And in |
| 1942 | // general, we should not increase the size without a very good reason. |
| 1943 | // |
| 1944 | // Why? Because low level search APIs return Result<.., MatchError>. When |
| 1945 | // MatchError gets bigger, so to does the Result type. |
| 1946 | // |
| 1947 | // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes |
| 1948 | // the importance of keeping a small error type. But without 'alloc', we |
| 1949 | // still want things to be small. |
| 1950 | #[test ] |
| 1951 | fn match_error_size() { |
| 1952 | let expected_size = if cfg!(feature = "alloc" ) { |
| 1953 | core::mem::size_of::<usize>() |
| 1954 | } else { |
| 1955 | 2 * core::mem::size_of::<usize>() |
| 1956 | }; |
| 1957 | assert_eq!(expected_size, core::mem::size_of::<MatchError>()); |
| 1958 | } |
| 1959 | |
| 1960 | // Same as above, but for the underlying match error kind. |
| 1961 | #[cfg (target_pointer_width = "64" )] |
| 1962 | #[test ] |
| 1963 | fn match_error_kind_size() { |
| 1964 | let expected_size = 2 * core::mem::size_of::<usize>(); |
| 1965 | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
| 1966 | } |
| 1967 | |
| 1968 | #[cfg (target_pointer_width = "32" )] |
| 1969 | #[test ] |
| 1970 | fn match_error_kind_size() { |
| 1971 | let expected_size = 3 * core::mem::size_of::<usize>(); |
| 1972 | assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); |
| 1973 | } |
| 1974 | |
| 1975 | #[test ] |
| 1976 | fn incorrect_asref_guard() { |
| 1977 | struct Bad(std::cell::Cell<bool>); |
| 1978 | |
| 1979 | impl AsRef<[u8]> for Bad { |
| 1980 | fn as_ref(&self) -> &[u8] { |
| 1981 | if self.0.replace(false) { |
| 1982 | &[] |
| 1983 | } else { |
| 1984 | &[0; 1000] |
| 1985 | } |
| 1986 | } |
| 1987 | } |
| 1988 | |
| 1989 | let bad = Bad(std::cell::Cell::new(true)); |
| 1990 | let input = Input::new(&bad); |
| 1991 | assert!(input.end() <= input.haystack().len()); |
| 1992 | } |
| 1993 | } |
| 1994 | |