1 | use core::ops::{Range, RangeBounds}; |
2 | |
3 | use crate::util::primitives::PatternID; |
4 | |
5 | /// The configuration and the haystack to use for an Aho-Corasick search. |
6 | /// |
7 | /// When executing a search, there are a few parameters one might want to |
8 | /// configure: |
9 | /// |
10 | /// * The haystack to search, provided to the [`Input::new`] constructor. This |
11 | /// is the only required parameter. |
12 | /// * The span _within_ the haystack to limit a search to. (The default |
13 | /// is the entire haystack.) This is configured via [`Input::span`] or |
14 | /// [`Input::range`]. |
15 | /// * Whether to run an unanchored (matches can occur anywhere after the |
16 | /// start of the search) or anchored (matches can only occur beginning at |
17 | /// the start of the search) search. Unanchored search is the default. This is |
18 | /// configured via [`Input::anchored`]. |
19 | /// * Whether to quit the search as soon as a match has been found, regardless |
20 | /// of the [`MatchKind`] that the searcher was built with. This is configured |
21 | /// via [`Input::earliest`]. |
22 | /// |
23 | /// For most cases, the defaults for all optional parameters are appropriate. |
24 | /// The utility of this type is that it keeps the default or common case simple |
25 | /// while permitting tweaking parameters in more niche use cases while reusing |
26 | /// the same search APIs. |
27 | /// |
28 | /// # Valid bounds and search termination |
29 | /// |
30 | /// An `Input` permits setting the bounds of a search via either |
31 | /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or |
32 | /// else a panic will occur. Bounds are valid if and only if: |
33 | /// |
34 | /// * The bounds represent a valid range into the input's haystack. |
35 | /// * **or** the end bound is a valid ending bound for the haystack *and* |
36 | /// the start bound is exactly one greater than the end bound. |
37 | /// |
38 | /// In the latter case, [`Input::is_done`] will return true and indicates any |
39 | /// search receiving such an input should immediately return with no match. |
40 | /// |
41 | /// Other than representing "search is complete," the `Input::span` and |
42 | /// `Input::range` APIs are never necessary. Instead, callers can slice the |
43 | /// haystack instead, e.g., with `&haystack[start..end]`. With that said, they |
44 | /// can be more convenient than slicing because the match positions reported |
45 | /// when using `Input::span` or `Input::range` are in terms of the original |
46 | /// haystack. If you instead use `&haystack[start..end]`, then you'll need to |
47 | /// add `start` to any match position returned in order for it to be a correct |
48 | /// index into `haystack`. |
49 | /// |
50 | /// # Example: `&str` and `&[u8]` automatically convert to an `Input` |
51 | /// |
52 | /// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`. |
53 | /// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept |
54 | /// a `Into<Input>`. These two things combined together mean you can provide |
55 | /// things like `&str` and `&[u8]` to search APIs when the defaults are |
56 | /// suitable, but also an `Input` when they're not. For example: |
57 | /// |
58 | /// ``` |
59 | /// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind}; |
60 | /// |
61 | /// // Build a searcher that supports both unanchored and anchored modes. |
62 | /// let ac = AhoCorasick::builder() |
63 | /// .start_kind(StartKind::Both) |
64 | /// .build(&["abcd" , "b" ]) |
65 | /// .unwrap(); |
66 | /// let haystack = "abcd" ; |
67 | /// |
68 | /// // A search using default parameters is unanchored. With standard |
69 | /// // semantics, this finds `b` first. |
70 | /// assert_eq!( |
71 | /// Some(Match::must(1, 1..2)), |
72 | /// ac.find(haystack), |
73 | /// ); |
74 | /// // Using the same 'find' routine, we can provide an 'Input' explicitly |
75 | /// // that is configured to do an anchored search. Since 'b' doesn't start |
76 | /// // at the beginning of the search, it is not reported as a match. |
77 | /// assert_eq!( |
78 | /// Some(Match::must(0, 0..4)), |
79 | /// ac.find(Input::new(haystack).anchored(Anchored::Yes)), |
80 | /// ); |
81 | /// ``` |
82 | #[derive(Clone)] |
83 | pub struct Input<'h> { |
84 | haystack: &'h [u8], |
85 | span: Span, |
86 | anchored: Anchored, |
87 | earliest: bool, |
88 | } |
89 | |
90 | impl<'h> Input<'h> { |
91 | /// Create a new search configuration for the given haystack. |
92 | #[inline ] |
93 | pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { |
94 | Input { |
95 | haystack: haystack.as_ref(), |
96 | span: Span { start: 0, end: haystack.as_ref().len() }, |
97 | anchored: Anchored::No, |
98 | earliest: false, |
99 | } |
100 | } |
101 | |
102 | /// Set the span for this search. |
103 | /// |
104 | /// This routine is generic over how a span is provided. While |
105 | /// a [`Span`] may be given directly, one may also provide a |
106 | /// `std::ops::Range<usize>`. To provide anything supported by range |
107 | /// syntax, use the [`Input::range`] method. |
108 | /// |
109 | /// The default span is the entire haystack. |
110 | /// |
111 | /// Note that [`Input::range`] overrides this method and vice versa. |
112 | /// |
113 | /// # Panics |
114 | /// |
115 | /// This panics if the given span does not correspond to valid bounds in |
116 | /// the haystack or the termination of a search. |
117 | /// |
118 | /// # Example |
119 | /// |
120 | /// This example shows how the span of the search can impact whether a |
121 | /// match is reported or not. |
122 | /// |
123 | /// ``` |
124 | /// use aho_corasick::{AhoCorasick, Input, MatchKind}; |
125 | /// |
126 | /// let patterns = &["b" , "abcd" , "abc" ]; |
127 | /// let haystack = "abcd" ; |
128 | /// |
129 | /// let ac = AhoCorasick::builder() |
130 | /// .match_kind(MatchKind::LeftmostFirst) |
131 | /// .build(patterns) |
132 | /// .unwrap(); |
133 | /// let input = Input::new(haystack).span(0..3); |
134 | /// let mat = ac.try_find(input)?.expect("should have a match" ); |
135 | /// // Without the span stopping the search early, 'abcd' would be reported |
136 | /// // because it is the correct leftmost-first match. |
137 | /// assert_eq!("abc" , &haystack[mat.span()]); |
138 | /// |
139 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
140 | /// ``` |
141 | #[inline ] |
142 | pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { |
143 | self.set_span(span); |
144 | self |
145 | } |
146 | |
147 | /// Like `Input::span`, but accepts any range instead. |
148 | /// |
149 | /// The default range is the entire haystack. |
150 | /// |
151 | /// Note that [`Input::span`] overrides this method and vice versa. |
152 | /// |
153 | /// # Panics |
154 | /// |
155 | /// This routine will panic if the given range could not be converted |
156 | /// to a valid [`Range`]. For example, this would panic when given |
157 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
158 | /// interval in terms of `usize`. |
159 | /// |
160 | /// This routine also panics if the given range does not correspond to |
161 | /// valid bounds in the haystack or the termination of a search. |
162 | /// |
163 | /// # Example |
164 | /// |
165 | /// ``` |
166 | /// use aho_corasick::Input; |
167 | /// |
168 | /// let input = Input::new("foobar" ); |
169 | /// assert_eq!(0..6, input.get_range()); |
170 | /// |
171 | /// let input = Input::new("foobar" ).range(2..=4); |
172 | /// assert_eq!(2..5, input.get_range()); |
173 | /// ``` |
174 | #[inline ] |
175 | pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { |
176 | self.set_range(range); |
177 | self |
178 | } |
179 | |
180 | /// Sets the anchor mode of a search. |
181 | /// |
182 | /// When a search is anchored (via [`Anchored::Yes`]), a match must begin |
183 | /// at the start of a search. When a search is not anchored (that's |
184 | /// [`Anchored::No`]), searchers will look for a match anywhere in the |
185 | /// haystack. |
186 | /// |
187 | /// By default, the anchored mode is [`Anchored::No`]. |
188 | /// |
189 | /// # Support for anchored searches |
190 | /// |
191 | /// Anchored or unanchored searches might not always be available, |
192 | /// depending on the type of searcher used and its configuration: |
193 | /// |
194 | /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always |
195 | /// supports both unanchored and anchored searches. |
196 | /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both |
197 | /// unanchored and anchored searches. |
198 | /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored |
199 | /// searches by default. |
200 | /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can |
201 | /// be used to change the default to supporting both kinds of searches |
202 | /// or even just anchored searches. |
203 | /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a |
204 | /// `DFA`. Namely, it only supports unanchored searches by default, but |
205 | /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |
206 | /// can change this. |
207 | /// |
208 | /// If you try to execute a search using a `try_` ("fallible") method with |
209 | /// an unsupported anchor mode, then an error will be returned. For calls |
210 | /// to infallible search methods, a panic will result. |
211 | /// |
212 | /// # Example |
213 | /// |
214 | /// This demonstrates the differences between an anchored search and |
215 | /// an unanchored search. Notice that we build our `AhoCorasick` searcher |
216 | /// with [`StartKind::Both`] so that it supports both unanchored and |
217 | /// anchored searches simultaneously. |
218 | /// |
219 | /// ``` |
220 | /// use aho_corasick::{ |
221 | /// AhoCorasick, Anchored, Input, MatchKind, StartKind, |
222 | /// }; |
223 | /// |
224 | /// let patterns = &["bcd" ]; |
225 | /// let haystack = "abcd" ; |
226 | /// |
227 | /// let ac = AhoCorasick::builder() |
228 | /// .start_kind(StartKind::Both) |
229 | /// .build(patterns) |
230 | /// .unwrap(); |
231 | /// |
232 | /// // Note that 'Anchored::No' is the default, so it doesn't need to |
233 | /// // be explicitly specified here. |
234 | /// let input = Input::new(haystack); |
235 | /// let mat = ac.try_find(input)?.expect("should have a match" ); |
236 | /// assert_eq!("bcd" , &haystack[mat.span()]); |
237 | /// |
238 | /// // While 'bcd' occurs in the haystack, it does not begin where our |
239 | /// // search begins, so no match is found. |
240 | /// let input = Input::new(haystack).anchored(Anchored::Yes); |
241 | /// assert_eq!(None, ac.try_find(input)?); |
242 | /// |
243 | /// // However, if we start our search where 'bcd' starts, then we will |
244 | /// // find a match. |
245 | /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes); |
246 | /// let mat = ac.try_find(input)?.expect("should have a match" ); |
247 | /// assert_eq!("bcd" , &haystack[mat.span()]); |
248 | /// |
249 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
250 | /// ``` |
251 | #[inline ] |
252 | pub fn anchored(mut self, mode: Anchored) -> Input<'h> { |
253 | self.set_anchored(mode); |
254 | self |
255 | } |
256 | |
257 | /// Whether to execute an "earliest" search or not. |
258 | /// |
259 | /// When running a non-overlapping search, an "earliest" search will |
260 | /// return the match location as early as possible. For example, given |
261 | /// the patterns `abc` and `b`, and a haystack of `abc`, a normal |
262 | /// leftmost-first search will return `abc` as a match. But an "earliest" |
263 | /// search will return as soon as it is known that a match occurs, which |
264 | /// happens once `b` is seen. |
265 | /// |
266 | /// Note that when using [`MatchKind::Standard`], the "earliest" option |
267 | /// has no effect since standard semantics are already "earliest." Note |
268 | /// also that this has no effect in overlapping searches, since overlapping |
269 | /// searches also use standard semantics and report all possible matches. |
270 | /// |
271 | /// This is disabled by default. |
272 | /// |
273 | /// # Example |
274 | /// |
275 | /// This example shows the difference between "earliest" searching and |
276 | /// normal leftmost searching. |
277 | /// |
278 | /// ``` |
279 | /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; |
280 | /// |
281 | /// let patterns = &["abc" , "b" ]; |
282 | /// let haystack = "abc" ; |
283 | /// |
284 | /// let ac = AhoCorasick::builder() |
285 | /// .match_kind(MatchKind::LeftmostFirst) |
286 | /// .build(patterns) |
287 | /// .unwrap(); |
288 | /// |
289 | /// // The normal leftmost-first match. |
290 | /// let input = Input::new(haystack); |
291 | /// let mat = ac.try_find(input)?.expect("should have a match" ); |
292 | /// assert_eq!("abc" , &haystack[mat.span()]); |
293 | /// |
294 | /// // The "earliest" possible match, even if it isn't leftmost-first. |
295 | /// let input = Input::new(haystack).earliest(true); |
296 | /// let mat = ac.try_find(input)?.expect("should have a match" ); |
297 | /// assert_eq!("b" , &haystack[mat.span()]); |
298 | /// |
299 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
300 | /// ``` |
301 | #[inline ] |
302 | pub fn earliest(mut self, yes: bool) -> Input<'h> { |
303 | self.set_earliest(yes); |
304 | self |
305 | } |
306 | |
307 | /// Set the span for this search configuration. |
308 | /// |
309 | /// This is like the [`Input::span`] method, except this mutates the |
310 | /// span in place. |
311 | /// |
312 | /// This routine is generic over how a span is provided. While |
313 | /// a [`Span`] may be given directly, one may also provide a |
314 | /// `std::ops::Range<usize>`. |
315 | /// |
316 | /// # Panics |
317 | /// |
318 | /// This panics if the given span does not correspond to valid bounds in |
319 | /// the haystack or the termination of a search. |
320 | /// |
321 | /// # Example |
322 | /// |
323 | /// ``` |
324 | /// use aho_corasick::Input; |
325 | /// |
326 | /// let mut input = Input::new("foobar" ); |
327 | /// assert_eq!(0..6, input.get_range()); |
328 | /// input.set_span(2..4); |
329 | /// assert_eq!(2..4, input.get_range()); |
330 | /// ``` |
331 | #[inline ] |
332 | pub fn set_span<S: Into<Span>>(&mut self, span: S) { |
333 | let span = span.into(); |
334 | assert!( |
335 | span.end <= self.haystack.len() |
336 | && span.start <= span.end.wrapping_add(1), |
337 | "invalid span {:?} for haystack of length {}" , |
338 | span, |
339 | self.haystack.len(), |
340 | ); |
341 | self.span = span; |
342 | } |
343 | |
344 | /// Set the span for this search configuration given any range. |
345 | /// |
346 | /// This is like the [`Input::range`] method, except this mutates the |
347 | /// span in place. |
348 | /// |
349 | /// # Panics |
350 | /// |
351 | /// This routine will panic if the given range could not be converted |
352 | /// to a valid [`Range`]. For example, this would panic when given |
353 | /// `0..=usize::MAX` since it cannot be represented using a half-open |
354 | /// interval in terms of `usize`. |
355 | /// |
356 | /// This routine also panics if the given range does not correspond to |
357 | /// valid bounds in the haystack or the termination of a search. |
358 | /// |
359 | /// # Example |
360 | /// |
361 | /// ``` |
362 | /// use aho_corasick::Input; |
363 | /// |
364 | /// let mut input = Input::new("foobar" ); |
365 | /// assert_eq!(0..6, input.get_range()); |
366 | /// input.set_range(2..=4); |
367 | /// assert_eq!(2..5, input.get_range()); |
368 | /// ``` |
369 | #[inline ] |
370 | pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { |
371 | use core::ops::Bound; |
372 | |
373 | // It's a little weird to convert ranges into spans, and then spans |
374 | // back into ranges when we actually slice the haystack. Because |
375 | // of that process, we always represent everything as a half-open |
376 | // internal. Therefore, handling things like m..=n is a little awkward. |
377 | let start = match range.start_bound() { |
378 | Bound::Included(&i) => i, |
379 | // Can this case ever happen? Range syntax doesn't support it... |
380 | Bound::Excluded(&i) => i.checked_add(1).unwrap(), |
381 | Bound::Unbounded => 0, |
382 | }; |
383 | let end = match range.end_bound() { |
384 | Bound::Included(&i) => i.checked_add(1).unwrap(), |
385 | Bound::Excluded(&i) => i, |
386 | Bound::Unbounded => self.haystack().len(), |
387 | }; |
388 | self.set_span(Span { start, end }); |
389 | } |
390 | |
391 | /// Set the starting offset for the span for this search configuration. |
392 | /// |
393 | /// This is a convenience routine for only mutating the start of a span |
394 | /// without having to set the entire span. |
395 | /// |
396 | /// # Panics |
397 | /// |
398 | /// This panics if the given span does not correspond to valid bounds in |
399 | /// the haystack or the termination of a search. |
400 | /// |
401 | /// # Example |
402 | /// |
403 | /// ``` |
404 | /// use aho_corasick::Input; |
405 | /// |
406 | /// let mut input = Input::new("foobar" ); |
407 | /// assert_eq!(0..6, input.get_range()); |
408 | /// input.set_start(5); |
409 | /// assert_eq!(5..6, input.get_range()); |
410 | /// ``` |
411 | #[inline ] |
412 | pub fn set_start(&mut self, start: usize) { |
413 | self.set_span(Span { start, ..self.get_span() }); |
414 | } |
415 | |
416 | /// Set the ending offset for the span for this search configuration. |
417 | /// |
418 | /// This is a convenience routine for only mutating the end of a span |
419 | /// without having to set the entire span. |
420 | /// |
421 | /// # Panics |
422 | /// |
423 | /// This panics if the given span does not correspond to valid bounds in |
424 | /// the haystack or the termination of a search. |
425 | /// |
426 | /// # Example |
427 | /// |
428 | /// ``` |
429 | /// use aho_corasick::Input; |
430 | /// |
431 | /// let mut input = Input::new("foobar" ); |
432 | /// assert_eq!(0..6, input.get_range()); |
433 | /// input.set_end(5); |
434 | /// assert_eq!(0..5, input.get_range()); |
435 | /// ``` |
436 | #[inline ] |
437 | pub fn set_end(&mut self, end: usize) { |
438 | self.set_span(Span { end, ..self.get_span() }); |
439 | } |
440 | |
441 | /// Set the anchor mode of a search. |
442 | /// |
443 | /// This is like [`Input::anchored`], except it mutates the search |
444 | /// configuration in place. |
445 | /// |
446 | /// # Example |
447 | /// |
448 | /// ``` |
449 | /// use aho_corasick::{Anchored, Input}; |
450 | /// |
451 | /// let mut input = Input::new("foobar" ); |
452 | /// assert_eq!(Anchored::No, input.get_anchored()); |
453 | /// |
454 | /// input.set_anchored(Anchored::Yes); |
455 | /// assert_eq!(Anchored::Yes, input.get_anchored()); |
456 | /// ``` |
457 | #[inline ] |
458 | pub fn set_anchored(&mut self, mode: Anchored) { |
459 | self.anchored = mode; |
460 | } |
461 | |
462 | /// Set whether the search should execute in "earliest" mode or not. |
463 | /// |
464 | /// This is like [`Input::earliest`], except it mutates the search |
465 | /// configuration in place. |
466 | /// |
467 | /// # Example |
468 | /// |
469 | /// ``` |
470 | /// use aho_corasick::Input; |
471 | /// |
472 | /// let mut input = Input::new("foobar" ); |
473 | /// assert!(!input.get_earliest()); |
474 | /// input.set_earliest(true); |
475 | /// assert!(input.get_earliest()); |
476 | /// ``` |
477 | #[inline ] |
478 | pub fn set_earliest(&mut self, yes: bool) { |
479 | self.earliest = yes; |
480 | } |
481 | |
482 | /// Return a borrow of the underlying haystack as a slice of bytes. |
483 | /// |
484 | /// # Example |
485 | /// |
486 | /// ``` |
487 | /// use aho_corasick::Input; |
488 | /// |
489 | /// let input = Input::new("foobar" ); |
490 | /// assert_eq!(b"foobar" , input.haystack()); |
491 | /// ``` |
492 | #[inline ] |
493 | pub fn haystack(&self) -> &[u8] { |
494 | self.haystack |
495 | } |
496 | |
497 | /// Return the start position of this search. |
498 | /// |
499 | /// This is a convenience routine for `search.get_span().start()`. |
500 | /// |
501 | /// # Example |
502 | /// |
503 | /// ``` |
504 | /// use aho_corasick::Input; |
505 | /// |
506 | /// let input = Input::new("foobar" ); |
507 | /// assert_eq!(0, input.start()); |
508 | /// |
509 | /// let input = Input::new("foobar" ).span(2..4); |
510 | /// assert_eq!(2, input.start()); |
511 | /// ``` |
512 | #[inline ] |
513 | pub fn start(&self) -> usize { |
514 | self.get_span().start |
515 | } |
516 | |
517 | /// Return the end position of this search. |
518 | /// |
519 | /// This is a convenience routine for `search.get_span().end()`. |
520 | /// |
521 | /// # Example |
522 | /// |
523 | /// ``` |
524 | /// use aho_corasick::Input; |
525 | /// |
526 | /// let input = Input::new("foobar" ); |
527 | /// assert_eq!(6, input.end()); |
528 | /// |
529 | /// let input = Input::new("foobar" ).span(2..4); |
530 | /// assert_eq!(4, input.end()); |
531 | /// ``` |
532 | #[inline ] |
533 | pub fn end(&self) -> usize { |
534 | self.get_span().end |
535 | } |
536 | |
537 | /// Return the span for this search configuration. |
538 | /// |
539 | /// If one was not explicitly set, then the span corresponds to the entire |
540 | /// range of the haystack. |
541 | /// |
542 | /// # Example |
543 | /// |
544 | /// ``` |
545 | /// use aho_corasick::{Input, Span}; |
546 | /// |
547 | /// let input = Input::new("foobar" ); |
548 | /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); |
549 | /// ``` |
550 | #[inline ] |
551 | pub fn get_span(&self) -> Span { |
552 | self.span |
553 | } |
554 | |
555 | /// Return the span as a range for this search configuration. |
556 | /// |
557 | /// If one was not explicitly set, then the span corresponds to the entire |
558 | /// range of the haystack. |
559 | /// |
560 | /// # Example |
561 | /// |
562 | /// ``` |
563 | /// use aho_corasick::Input; |
564 | /// |
565 | /// let input = Input::new("foobar" ); |
566 | /// assert_eq!(0..6, input.get_range()); |
567 | /// ``` |
568 | #[inline ] |
569 | pub fn get_range(&self) -> Range<usize> { |
570 | self.get_span().range() |
571 | } |
572 | |
573 | /// Return the anchored mode for this search configuration. |
574 | /// |
575 | /// If no anchored mode was set, then it defaults to [`Anchored::No`]. |
576 | /// |
577 | /// # Example |
578 | /// |
579 | /// ``` |
580 | /// use aho_corasick::{Anchored, Input}; |
581 | /// |
582 | /// let mut input = Input::new("foobar" ); |
583 | /// assert_eq!(Anchored::No, input.get_anchored()); |
584 | /// |
585 | /// input.set_anchored(Anchored::Yes); |
586 | /// assert_eq!(Anchored::Yes, input.get_anchored()); |
587 | /// ``` |
588 | #[inline ] |
589 | pub fn get_anchored(&self) -> Anchored { |
590 | self.anchored |
591 | } |
592 | |
593 | /// Return whether this search should execute in "earliest" mode. |
594 | /// |
595 | /// # Example |
596 | /// |
597 | /// ``` |
598 | /// use aho_corasick::Input; |
599 | /// |
600 | /// let input = Input::new("foobar" ); |
601 | /// assert!(!input.get_earliest()); |
602 | /// ``` |
603 | #[inline ] |
604 | pub fn get_earliest(&self) -> bool { |
605 | self.earliest |
606 | } |
607 | |
608 | /// Return true if this input has been exhausted, which in turn means all |
609 | /// subsequent searches will return no matches. |
610 | /// |
611 | /// This occurs precisely when the start position of this search is greater |
612 | /// than the end position of the search. |
613 | /// |
614 | /// # Example |
615 | /// |
616 | /// ``` |
617 | /// use aho_corasick::Input; |
618 | /// |
619 | /// let mut input = Input::new("foobar" ); |
620 | /// assert!(!input.is_done()); |
621 | /// input.set_start(6); |
622 | /// assert!(!input.is_done()); |
623 | /// input.set_start(7); |
624 | /// assert!(input.is_done()); |
625 | /// ``` |
626 | #[inline ] |
627 | pub fn is_done(&self) -> bool { |
628 | self.get_span().start > self.get_span().end |
629 | } |
630 | } |
631 | |
632 | impl<'h> core::fmt::Debug for Input<'h> { |
633 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
634 | let mut fmter = f.debug_struct("Input" ); |
635 | match core::str::from_utf8(self.haystack()) { |
636 | Ok(nice) => fmter.field("haystack" , &nice), |
637 | Err(_) => fmter.field("haystack" , &self.haystack()), |
638 | } |
639 | .field("span" , &self.span) |
640 | .field("anchored" , &self.anchored) |
641 | .field("earliest" , &self.earliest) |
642 | .finish() |
643 | } |
644 | } |
645 | |
646 | impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { |
647 | #[inline ] |
648 | fn from(haystack: &'h H) -> Input<'h> { |
649 | Input::new(haystack) |
650 | } |
651 | } |
652 | |
653 | /// A representation of a range in a haystack. |
654 | /// |
655 | /// A span corresponds to the starting and ending _byte offsets_ of a |
656 | /// contiguous region of bytes. The starting offset is inclusive while the |
657 | /// ending offset is exclusive. That is, a span is a half-open interval. |
658 | /// |
659 | /// A span is used to report the offsets of a match, but it is also used to |
660 | /// convey which region of a haystack should be searched via routines like |
661 | /// [`Input::span`]. |
662 | /// |
663 | /// This is basically equivalent to a `std::ops::Range<usize>`, except this |
664 | /// type implements `Copy` which makes it more ergonomic to use in the context |
665 | /// of this crate. Indeed, `Span` exists only because `Range<usize>` does |
666 | /// not implement `Copy`. Like a range, this implements `Index` for `[u8]` |
667 | /// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls |
668 | /// `From<Range>`, which means things like `Span::from(5..10)` work. |
669 | /// |
670 | /// There are no constraints on the values of a span. It is, for example, legal |
671 | /// to create a span where `start > end`. |
672 | #[derive(Clone, Copy, Eq, Hash, PartialEq)] |
673 | pub struct Span { |
674 | /// The start offset of the span, inclusive. |
675 | pub start: usize, |
676 | /// The end offset of the span, exclusive. |
677 | pub end: usize, |
678 | } |
679 | |
680 | impl Span { |
681 | /// Returns this span as a range. |
682 | #[inline ] |
683 | pub fn range(&self) -> Range<usize> { |
684 | Range::from(*self) |
685 | } |
686 | |
687 | /// Returns true when this span is empty. That is, when `start >= end`. |
688 | #[inline ] |
689 | pub fn is_empty(&self) -> bool { |
690 | self.start >= self.end |
691 | } |
692 | |
693 | /// Returns the length of this span. |
694 | /// |
695 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
696 | #[inline ] |
697 | pub fn len(&self) -> usize { |
698 | self.end.saturating_sub(self.start) |
699 | } |
700 | |
701 | /// Returns true when the given offset is contained within this span. |
702 | /// |
703 | /// Note that an empty span contains no offsets and will always return |
704 | /// false. |
705 | #[inline ] |
706 | pub fn contains(&self, offset: usize) -> bool { |
707 | !self.is_empty() && self.start <= offset && offset <= self.end |
708 | } |
709 | |
710 | /// Returns a new span with `offset` added to this span's `start` and `end` |
711 | /// values. |
712 | #[inline ] |
713 | pub fn offset(&self, offset: usize) -> Span { |
714 | Span { start: self.start + offset, end: self.end + offset } |
715 | } |
716 | } |
717 | |
718 | impl core::fmt::Debug for Span { |
719 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
720 | write!(f, "{}..{}" , self.start, self.end) |
721 | } |
722 | } |
723 | |
724 | impl core::ops::Index<Span> for [u8] { |
725 | type Output = [u8]; |
726 | |
727 | #[inline ] |
728 | fn index(&self, index: Span) -> &[u8] { |
729 | &self[index.range()] |
730 | } |
731 | } |
732 | |
733 | impl core::ops::IndexMut<Span> for [u8] { |
734 | #[inline ] |
735 | fn index_mut(&mut self, index: Span) -> &mut [u8] { |
736 | &mut self[index.range()] |
737 | } |
738 | } |
739 | |
740 | impl core::ops::Index<Span> for str { |
741 | type Output = str; |
742 | |
743 | #[inline ] |
744 | fn index(&self, index: Span) -> &str { |
745 | &self[index.range()] |
746 | } |
747 | } |
748 | |
749 | impl From<Range<usize>> for Span { |
750 | #[inline ] |
751 | fn from(range: Range<usize>) -> Span { |
752 | Span { start: range.start, end: range.end } |
753 | } |
754 | } |
755 | |
756 | impl From<Span> for Range<usize> { |
757 | #[inline ] |
758 | fn from(span: Span) -> Range<usize> { |
759 | Range { start: span.start, end: span.end } |
760 | } |
761 | } |
762 | |
763 | impl PartialEq<Range<usize>> for Span { |
764 | #[inline ] |
765 | fn eq(&self, range: &Range<usize>) -> bool { |
766 | self.start == range.start && self.end == range.end |
767 | } |
768 | } |
769 | |
770 | impl PartialEq<Span> for Range<usize> { |
771 | #[inline ] |
772 | fn eq(&self, span: &Span) -> bool { |
773 | self.start == span.start && self.end == span.end |
774 | } |
775 | } |
776 | |
777 | /// The type of anchored search to perform. |
778 | /// |
779 | /// If an Aho-Corasick searcher does not support the anchored mode selected, |
780 | /// then the search will return an error or panic, depending on whether a |
781 | /// fallible or an infallible routine was called. |
782 | #[non_exhaustive ] |
783 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
784 | pub enum Anchored { |
785 | /// Run an unanchored search. This means a match may occur anywhere at or |
786 | /// after the start position of the search up until the end position of the |
787 | /// search. |
788 | No, |
789 | /// Run an anchored search. This means that a match must begin at the start |
790 | /// position of the search and end before the end position of the search. |
791 | Yes, |
792 | } |
793 | |
794 | impl Anchored { |
795 | /// Returns true if and only if this anchor mode corresponds to an anchored |
796 | /// search. |
797 | /// |
798 | /// # Example |
799 | /// |
800 | /// ``` |
801 | /// use aho_corasick::Anchored; |
802 | /// |
803 | /// assert!(!Anchored::No.is_anchored()); |
804 | /// assert!(Anchored::Yes.is_anchored()); |
805 | /// ``` |
806 | #[inline ] |
807 | pub fn is_anchored(&self) -> bool { |
808 | matches!(*self, Anchored::Yes) |
809 | } |
810 | } |
811 | |
812 | /// A representation of a match reported by an Aho-Corasick searcher. |
813 | /// |
814 | /// A match has two essential pieces of information: the [`PatternID`] that |
815 | /// matches, and the [`Span`] of the match in a haystack. |
816 | /// |
817 | /// The pattern is identified by an ID, which corresponds to its position |
818 | /// (starting from `0`) relative to other patterns used to construct the |
819 | /// corresponding searcher. If only a single pattern is provided, then all |
820 | /// matches are guaranteed to have a pattern ID of `0`. |
821 | /// |
822 | /// Every match reported by a searcher guarantees that its span has its start |
823 | /// offset as less than or equal to its end offset. |
824 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |
825 | pub struct Match { |
826 | /// The pattern ID. |
827 | pattern: PatternID, |
828 | /// The underlying match span. |
829 | span: Span, |
830 | } |
831 | |
832 | impl Match { |
833 | /// Create a new match from a pattern ID and a span. |
834 | /// |
835 | /// This constructor is generic over how a span is provided. While |
836 | /// a [`Span`] may be given directly, one may also provide a |
837 | /// `std::ops::Range<usize>`. |
838 | /// |
839 | /// # Panics |
840 | /// |
841 | /// This panics if `end < start`. |
842 | /// |
843 | /// # Example |
844 | /// |
845 | /// This shows how to create a match for the first pattern in an |
846 | /// Aho-Corasick searcher using convenient range syntax. |
847 | /// |
848 | /// ``` |
849 | /// use aho_corasick::{Match, PatternID}; |
850 | /// |
851 | /// let m = Match::new(PatternID::ZERO, 5..10); |
852 | /// assert_eq!(0, m.pattern().as_usize()); |
853 | /// assert_eq!(5, m.start()); |
854 | /// assert_eq!(10, m.end()); |
855 | /// ``` |
856 | #[inline ] |
857 | pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { |
858 | let span = span.into(); |
859 | assert!(span.start <= span.end, "invalid match span" ); |
860 | Match { pattern, span } |
861 | } |
862 | |
863 | /// Create a new match from a pattern ID and a byte offset span. |
864 | /// |
865 | /// This constructor is generic over how a span is provided. While |
866 | /// a [`Span`] may be given directly, one may also provide a |
867 | /// `std::ops::Range<usize>`. |
868 | /// |
869 | /// This is like [`Match::new`], but accepts a `usize` instead of a |
870 | /// [`PatternID`]. This panics if the given `usize` is not representable |
871 | /// as a `PatternID`. |
872 | /// |
873 | /// # Panics |
874 | /// |
875 | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |
876 | /// |
877 | /// # Example |
878 | /// |
879 | /// This shows how to create a match for the third pattern in an |
880 | /// Aho-Corasick searcher using convenient range syntax. |
881 | /// |
882 | /// ``` |
883 | /// use aho_corasick::Match; |
884 | /// |
885 | /// let m = Match::must(3, 5..10); |
886 | /// assert_eq!(3, m.pattern().as_usize()); |
887 | /// assert_eq!(5, m.start()); |
888 | /// assert_eq!(10, m.end()); |
889 | /// ``` |
890 | #[inline ] |
891 | pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { |
892 | Match::new(PatternID::must(pattern), span) |
893 | } |
894 | |
895 | /// Returns the ID of the pattern that matched. |
896 | /// |
897 | /// The ID of a pattern is derived from the position in which it was |
898 | /// originally inserted into the corresponding searcher. The first pattern |
899 | /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on. |
900 | #[inline ] |
901 | pub fn pattern(&self) -> PatternID { |
902 | self.pattern |
903 | } |
904 | |
905 | /// The starting position of the match. |
906 | /// |
907 | /// This is a convenience routine for `Match::span().start`. |
908 | #[inline ] |
909 | pub fn start(&self) -> usize { |
910 | self.span().start |
911 | } |
912 | |
913 | /// The ending position of the match. |
914 | /// |
915 | /// This is a convenience routine for `Match::span().end`. |
916 | #[inline ] |
917 | pub fn end(&self) -> usize { |
918 | self.span().end |
919 | } |
920 | |
921 | /// Returns the match span as a range. |
922 | /// |
923 | /// This is a convenience routine for `Match::span().range()`. |
924 | #[inline ] |
925 | pub fn range(&self) -> core::ops::Range<usize> { |
926 | self.span().range() |
927 | } |
928 | |
929 | /// Returns the span for this match. |
930 | #[inline ] |
931 | pub fn span(&self) -> Span { |
932 | self.span |
933 | } |
934 | |
935 | /// Returns true when the span in this match is empty. |
936 | /// |
937 | /// An empty match can only be returned when empty pattern is in the |
938 | /// Aho-Corasick searcher. |
939 | #[inline ] |
940 | pub fn is_empty(&self) -> bool { |
941 | self.span().is_empty() |
942 | } |
943 | |
944 | /// Returns the length of this match. |
945 | /// |
946 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |
947 | #[inline ] |
948 | pub fn len(&self) -> usize { |
949 | self.span().len() |
950 | } |
951 | |
952 | /// Returns a new match with `offset` added to its span's `start` and `end` |
953 | /// values. |
954 | #[inline ] |
955 | pub fn offset(&self, offset: usize) -> Match { |
956 | Match { |
957 | pattern: self.pattern, |
958 | span: Span { |
959 | start: self.start() + offset, |
960 | end: self.end() + offset, |
961 | }, |
962 | } |
963 | } |
964 | } |
965 | |
966 | /// A knob for controlling the match semantics of an Aho-Corasick automaton. |
967 | /// |
968 | /// There are two generally different ways that Aho-Corasick automatons can |
969 | /// report matches. The first way is the "standard" approach that results from |
970 | /// implementing most textbook explanations of Aho-Corasick. The second way is |
971 | /// to report only the leftmost non-overlapping matches. The leftmost approach |
972 | /// is in turn split into two different ways of resolving ambiguous matches: |
973 | /// leftmost-first and leftmost-longest. |
974 | /// |
975 | /// The `Standard` match kind is the default and is the only one that supports |
976 | /// overlapping matches and stream searching. (Trying to find overlapping or |
977 | /// streaming matches using leftmost match semantics will result in an error in |
978 | /// fallible APIs and a panic when using infallibe APIs.) The `Standard` match |
979 | /// kind will report matches as they are seen. When searching for overlapping |
980 | /// matches, then all possible matches are reported. When searching for |
981 | /// non-overlapping matches, the first match seen is reported. For example, for |
982 | /// non-overlapping matches, given the patterns `abcd` and `b` and the haystack |
983 | /// `abcdef`, only a match for `b` is reported since it is detected first. The |
984 | /// `abcd` match is never reported since it overlaps with the `b` match. |
985 | /// |
986 | /// In contrast, the leftmost match kind always prefers the leftmost match |
987 | /// among all possible matches. Given the same example as above with `abcd` and |
988 | /// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd` |
989 | /// since it begins before the `b` match, even though the `b` match is detected |
990 | /// before the `abcd` match. In this case, the `b` match is not reported at all |
991 | /// since it overlaps with the `abcd` match. |
992 | /// |
993 | /// The difference between leftmost-first and leftmost-longest is in how they |
994 | /// resolve ambiguous matches when there are multiple leftmost matches to |
995 | /// choose from. Leftmost-first always chooses the pattern that was provided |
996 | /// earliest, where as leftmost-longest always chooses the longest matching |
997 | /// pattern. For example, given the patterns `a` and `ab` and the subject |
998 | /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match |
999 | /// is `ab`. Conversely, if the patterns were given in reverse order, i.e., |
1000 | /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches |
1001 | /// would be `ab`. Stated differently, the leftmost-first match depends on the |
1002 | /// order in which the patterns were given to the Aho-Corasick automaton. |
1003 | /// Because of that, when leftmost-first matching is used, if a pattern `A` |
1004 | /// that appears before a pattern `B` is a prefix of `B`, then it is impossible |
1005 | /// to ever observe a match of `B`. |
1006 | /// |
1007 | /// If you're not sure which match kind to pick, then stick with the standard |
1008 | /// kind, which is the default. In particular, if you need overlapping or |
1009 | /// streaming matches, then you _must_ use the standard kind. The leftmost |
1010 | /// kinds are useful in specific circumstances. For example, leftmost-first can |
1011 | /// be very useful as a way to implement match priority based on the order of |
1012 | /// patterns given and leftmost-longest can be useful for dictionary searching |
1013 | /// such that only the longest matching words are reported. |
1014 | /// |
1015 | /// # Relationship with regular expression alternations |
1016 | /// |
1017 | /// Understanding match semantics can be a little tricky, and one easy way |
1018 | /// to conceptualize non-overlapping matches from an Aho-Corasick automaton |
1019 | /// is to think about them as a simple alternation of literals in a regular |
1020 | /// expression. For example, let's say we wanted to match the strings |
1021 | /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It |
1022 | /// turns out that regular expression engines have two different ways of |
1023 | /// matching this alternation. The first way, leftmost-longest, is commonly |
1024 | /// found in POSIX compatible implementations of regular expressions (such as |
1025 | /// `grep`). The second way, leftmost-first, is commonly found in backtracking |
1026 | /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's |
1027 | /// regex engine do not use backtracking, but still implement leftmost-first |
1028 | /// semantics in an effort to match the behavior of dominant backtracking |
1029 | /// regex engines such as those found in Perl, Ruby, Python, Javascript and |
1030 | /// PHP.) |
1031 | /// |
1032 | /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex |
1033 | /// will match `Samwise` because it is the longest possible match, but a |
1034 | /// Perl-like regex will match `Sam` since it appears earlier in the |
1035 | /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine |
1036 | /// will never match `Samwise` since `Sam` will always have higher priority. |
1037 | /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to |
1038 | /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is |
1039 | /// still longest match, but it also appears earlier than `Sam`. |
1040 | /// |
1041 | /// The "standard" match semantics of Aho-Corasick generally don't correspond |
1042 | /// to the match semantics of any large group of regex implementations, so |
1043 | /// there's no direct analogy that can be made here. Standard match semantics |
1044 | /// are generally useful for overlapping matches, or if you just want to see |
1045 | /// matches as they are detected. |
1046 | /// |
1047 | /// The main conclusion to draw from this section is that the match semantics |
1048 | /// can be tweaked to precisely match either Perl-like regex alternations or |
1049 | /// POSIX regex alternations. |
1050 | #[non_exhaustive ] |
1051 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1052 | pub enum MatchKind { |
1053 | /// Use standard match semantics, which support overlapping matches. When |
1054 | /// used with non-overlapping matches, matches are reported as they are |
1055 | /// seen. |
1056 | Standard, |
1057 | /// Use leftmost-first match semantics, which reports leftmost matches. |
1058 | /// When there are multiple possible leftmost matches, the match |
1059 | /// corresponding to the pattern that appeared earlier when constructing |
1060 | /// the automaton is reported. |
1061 | /// |
1062 | /// This does **not** support overlapping matches or stream searching. If |
1063 | /// this match kind is used, attempting to find overlapping matches or |
1064 | /// stream matches will fail. |
1065 | LeftmostFirst, |
1066 | /// Use leftmost-longest match semantics, which reports leftmost matches. |
1067 | /// When there are multiple possible leftmost matches, the longest match |
1068 | /// is chosen. |
1069 | /// |
1070 | /// This does **not** support overlapping matches or stream searching. If |
1071 | /// this match kind is used, attempting to find overlapping matches or |
1072 | /// stream matches will fail. |
1073 | LeftmostLongest, |
1074 | } |
1075 | |
1076 | /// The default match kind is `MatchKind::Standard`. |
1077 | impl Default for MatchKind { |
1078 | fn default() -> MatchKind { |
1079 | MatchKind::Standard |
1080 | } |
1081 | } |
1082 | |
1083 | impl MatchKind { |
1084 | #[inline ] |
1085 | pub(crate) fn is_standard(&self) -> bool { |
1086 | matches!(*self, MatchKind::Standard) |
1087 | } |
1088 | |
1089 | #[inline ] |
1090 | pub(crate) fn is_leftmost(&self) -> bool { |
1091 | matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest) |
1092 | } |
1093 | |
1094 | #[inline ] |
1095 | pub(crate) fn is_leftmost_first(&self) -> bool { |
1096 | matches!(*self, MatchKind::LeftmostFirst) |
1097 | } |
1098 | |
1099 | /// Convert this match kind into a packed match kind. If this match kind |
1100 | /// corresponds to standard semantics, then this returns None, since |
1101 | /// packed searching does not support standard semantics. |
1102 | #[inline ] |
1103 | pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> { |
1104 | match *self { |
1105 | MatchKind::Standard => None, |
1106 | MatchKind::LeftmostFirst => { |
1107 | Some(crate::packed::MatchKind::LeftmostFirst) |
1108 | } |
1109 | MatchKind::LeftmostLongest => { |
1110 | Some(crate::packed::MatchKind::LeftmostLongest) |
1111 | } |
1112 | } |
1113 | } |
1114 | } |
1115 | |
1116 | /// The kind of anchored starting configurations to support in an Aho-Corasick |
1117 | /// searcher. |
1118 | /// |
1119 | /// Depending on which searcher is used internally by |
1120 | /// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored |
1121 | /// and anchored searches can be quite costly. For this reason, |
1122 | /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |
1123 | /// can be used to configure whether your searcher supports unanchored, |
1124 | /// anchored or both kinds of searches. |
1125 | /// |
1126 | /// This searcher configuration knob works in concert with the search time |
1127 | /// configuration [`Input::anchored`]. Namely, if one requests an unsupported |
1128 | /// anchored mode, then the search will either panic or return an error, |
1129 | /// depending on whether you're using infallible or fallibe APIs, respectively. |
1130 | /// |
1131 | /// `AhoCorasick` by default only supports unanchored searches. |
1132 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
1133 | pub enum StartKind { |
1134 | /// Support both anchored and unanchored searches. |
1135 | Both, |
1136 | /// Support only unanchored searches. Requesting an anchored search will |
1137 | /// return an error in fallible APIs and panic in infallible APIs. |
1138 | Unanchored, |
1139 | /// Support only anchored searches. Requesting an unanchored search will |
1140 | /// return an error in fallible APIs and panic in infallible APIs. |
1141 | Anchored, |
1142 | } |
1143 | |
1144 | impl Default for StartKind { |
1145 | fn default() -> StartKind { |
1146 | StartKind::Unanchored |
1147 | } |
1148 | } |
1149 | |