1 | use core::ops::{Range, RangeBounds}; |
---|---|

2 | |

3 | use crate::util::primitives::PatternID; |

4 | |

5 | /// The configuration and the haystack to use for an Aho-Corasick search. |

6 | /// |

7 | /// When executing a search, there are a few parameters one might want to |

8 | /// configure: |

9 | /// |

10 | /// * The haystack to search, provided to the [`Input::new`] constructor. This |

11 | /// is the only required parameter. |

12 | /// * The span _within_ the haystack to limit a search to. (The default |

13 | /// is the entire haystack.) This is configured via [`Input::span`] or |

14 | /// [`Input::range`]. |

15 | /// * Whether to run an unanchored (matches can occur anywhere after the |

16 | /// start of the search) or anchored (matches can only occur beginning at |

17 | /// the start of the search) search. Unanchored search is the default. This is |

18 | /// configured via [`Input::anchored`]. |

19 | /// * Whether to quit the search as soon as a match has been found, regardless |

20 | /// of the [`MatchKind`] that the searcher was built with. This is configured |

21 | /// via [`Input::earliest`]. |

22 | /// |

23 | /// For most cases, the defaults for all optional parameters are appropriate. |

24 | /// The utility of this type is that it keeps the default or common case simple |

25 | /// while permitting tweaking parameters in more niche use cases while reusing |

26 | /// the same search APIs. |

27 | /// |

28 | /// # Valid bounds and search termination |

29 | /// |

30 | /// An `Input` permits setting the bounds of a search via either |

31 | /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or |

32 | /// else a panic will occur. Bounds are valid if and only if: |

33 | /// |

34 | /// * The bounds represent a valid range into the input's haystack. |

35 | /// * **or** the end bound is a valid ending bound for the haystack *and* |

36 | /// the start bound is exactly one greater than the end bound. |

37 | /// |

38 | /// In the latter case, [`Input::is_done`] will return true and indicates any |

39 | /// search receiving such an input should immediately return with no match. |

40 | /// |

41 | /// Other than representing "search is complete," the `Input::span` and |

42 | /// `Input::range` APIs are never necessary. Instead, callers can slice the |

43 | /// haystack instead, e.g., with `&haystack[start..end]`. With that said, they |

44 | /// can be more convenient than slicing because the match positions reported |

45 | /// when using `Input::span` or `Input::range` are in terms of the original |

46 | /// haystack. If you instead use `&haystack[start..end]`, then you'll need to |

47 | /// add `start` to any match position returned in order for it to be a correct |

48 | /// index into `haystack`. |

49 | /// |

50 | /// # Example: `&str` and `&[u8]` automatically convert to an `Input` |

51 | /// |

52 | /// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`. |

53 | /// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept |

54 | /// a `Into<Input>`. These two things combined together mean you can provide |

55 | /// things like `&str` and `&[u8]` to search APIs when the defaults are |

56 | /// suitable, but also an `Input` when they're not. For example: |

57 | /// |

58 | /// ``` |

59 | /// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind}; |

60 | /// |

61 | /// // Build a searcher that supports both unanchored and anchored modes. |

62 | /// let ac = AhoCorasick::builder() |

63 | /// .start_kind(StartKind::Both) |

64 | /// .build(&["abcd", "b"]) |

65 | /// .unwrap(); |

66 | /// let haystack = "abcd"; |

67 | /// |

68 | /// // A search using default parameters is unanchored. With standard |

69 | /// // semantics, this finds `b` first. |

70 | /// assert_eq!( |

71 | /// Some(Match::must(1, 1..2)), |

72 | /// ac.find(haystack), |

73 | /// ); |

74 | /// // Using the same 'find' routine, we can provide an 'Input' explicitly |

75 | /// // that is configured to do an anchored search. Since 'b' doesn't start |

76 | /// // at the beginning of the search, it is not reported as a match. |

77 | /// assert_eq!( |

78 | /// Some(Match::must(0, 0..4)), |

79 | /// ac.find(Input::new(haystack).anchored(Anchored::Yes)), |

80 | /// ); |

81 | /// ``` |

82 | #[derive(Clone)] |

83 | pub struct Input<'h> { |

84 | haystack: &'h [u8], |

85 | span: Span, |

86 | anchored: Anchored, |

87 | earliest: bool, |

88 | } |

89 | |

90 | impl<'h> Input<'h> { |

91 | /// Create a new search configuration for the given haystack. |

92 | #[inline] |

93 | pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { |

94 | Input { |

95 | haystack: haystack.as_ref(), |

96 | span: Span { start: 0, end: haystack.as_ref().len() }, |

97 | anchored: Anchored::No, |

98 | earliest: false, |

99 | } |

100 | } |

101 | |

102 | /// Set the span for this search. |

103 | /// |

104 | /// This routine is generic over how a span is provided. While |

105 | /// a [`Span`] may be given directly, one may also provide a |

106 | /// `std::ops::Range<usize>`. To provide anything supported by range |

107 | /// syntax, use the [`Input::range`] method. |

108 | /// |

109 | /// The default span is the entire haystack. |

110 | /// |

111 | /// Note that [`Input::range`] overrides this method and vice versa. |

112 | /// |

113 | /// # Panics |

114 | /// |

115 | /// This panics if the given span does not correspond to valid bounds in |

116 | /// the haystack or the termination of a search. |

117 | /// |

118 | /// # Example |

119 | /// |

120 | /// This example shows how the span of the search can impact whether a |

121 | /// match is reported or not. |

122 | /// |

123 | /// ``` |

124 | /// use aho_corasick::{AhoCorasick, Input, MatchKind}; |

125 | /// |

126 | /// let patterns = &["b", "abcd", "abc"]; |

127 | /// let haystack = "abcd"; |

128 | /// |

129 | /// let ac = AhoCorasick::builder() |

130 | /// .match_kind(MatchKind::LeftmostFirst) |

131 | /// .build(patterns) |

132 | /// .unwrap(); |

133 | /// let input = Input::new(haystack).span(0..3); |

134 | /// let mat = ac.try_find(input)?.expect("should have a match"); |

135 | /// // Without the span stopping the search early, 'abcd' would be reported |

136 | /// // because it is the correct leftmost-first match. |

137 | /// assert_eq!("abc", &haystack[mat.span()]); |

138 | /// |

139 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

140 | /// ``` |

141 | #[inline] |

142 | pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { |

143 | self.set_span(span); |

144 | self |

145 | } |

146 | |

147 | /// Like `Input::span`, but accepts any range instead. |

148 | /// |

149 | /// The default range is the entire haystack. |

150 | /// |

151 | /// Note that [`Input::span`] overrides this method and vice versa. |

152 | /// |

153 | /// # Panics |

154 | /// |

155 | /// This routine will panic if the given range could not be converted |

156 | /// to a valid [`Range`]. For example, this would panic when given |

157 | /// `0..=usize::MAX` since it cannot be represented using a half-open |

158 | /// interval in terms of `usize`. |

159 | /// |

160 | /// This routine also panics if the given range does not correspond to |

161 | /// valid bounds in the haystack or the termination of a search. |

162 | /// |

163 | /// # Example |

164 | /// |

165 | /// ``` |

166 | /// use aho_corasick::Input; |

167 | /// |

168 | /// let input = Input::new("foobar"); |

169 | /// assert_eq!(0..6, input.get_range()); |

170 | /// |

171 | /// let input = Input::new("foobar").range( 2..=4); |

172 | /// assert_eq!(2..5, input.get_range()); |

173 | /// ``` |

174 | #[inline] |

175 | pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { |

176 | self.set_range(range); |

177 | self |

178 | } |

179 | |

180 | /// Sets the anchor mode of a search. |

181 | /// |

182 | /// When a search is anchored (via [`Anchored::Yes`]), a match must begin |

183 | /// at the start of a search. When a search is not anchored (that's |

184 | /// [`Anchored::No`]), searchers will look for a match anywhere in the |

185 | /// haystack. |

186 | /// |

187 | /// By default, the anchored mode is [`Anchored::No`]. |

188 | /// |

189 | /// # Support for anchored searches |

190 | /// |

191 | /// Anchored or unanchored searches might not always be available, |

192 | /// depending on the type of searcher used and its configuration: |

193 | /// |

194 | /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always |

195 | /// supports both unanchored and anchored searches. |

196 | /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both |

197 | /// unanchored and anchored searches. |

198 | /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored |

199 | /// searches by default. |

200 | /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can |

201 | /// be used to change the default to supporting both kinds of searches |

202 | /// or even just anchored searches. |

203 | /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a |

204 | /// `DFA`. Namely, it only supports unanchored searches by default, but |

205 | /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |

206 | /// can change this. |

207 | /// |

208 | /// If you try to execute a search using a `try_` ("fallible") method with |

209 | /// an unsupported anchor mode, then an error will be returned. For calls |

210 | /// to infallible search methods, a panic will result. |

211 | /// |

212 | /// # Example |

213 | /// |

214 | /// This demonstrates the differences between an anchored search and |

215 | /// an unanchored search. Notice that we build our `AhoCorasick` searcher |

216 | /// with [`StartKind::Both`] so that it supports both unanchored and |

217 | /// anchored searches simultaneously. |

218 | /// |

219 | /// ``` |

220 | /// use aho_corasick::{ |

221 | /// AhoCorasick, Anchored, Input, MatchKind, StartKind, |

222 | /// }; |

223 | /// |

224 | /// let patterns = &["bcd"]; |

225 | /// let haystack = "abcd"; |

226 | /// |

227 | /// let ac = AhoCorasick::builder() |

228 | /// .start_kind(StartKind::Both) |

229 | /// .build(patterns) |

230 | /// .unwrap(); |

231 | /// |

232 | /// // Note that 'Anchored::No' is the default, so it doesn't need to |

233 | /// // be explicitly specified here. |

234 | /// let input = Input::new(haystack); |

235 | /// let mat = ac.try_find(input)?.expect("should have a match"); |

236 | /// assert_eq!("bcd", &haystack[mat.span()]); |

237 | /// |

238 | /// // While 'bcd' occurs in the haystack, it does not begin where our |

239 | /// // search begins, so no match is found. |

240 | /// let input = Input::new(haystack).anchored(Anchored::Yes); |

241 | /// assert_eq!(None, ac.try_find(input)?); |

242 | /// |

243 | /// // However, if we start our search where 'bcd' starts, then we will |

244 | /// // find a match. |

245 | /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes); |

246 | /// let mat = ac.try_find(input)?.expect("should have a match"); |

247 | /// assert_eq!("bcd", &haystack[mat.span()]); |

248 | /// |

249 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

250 | /// ``` |

251 | #[inline] |

252 | pub fn anchored(mut self, mode: Anchored) -> Input<'h> { |

253 | self.set_anchored(mode); |

254 | self |

255 | } |

256 | |

257 | /// Whether to execute an "earliest" search or not. |

258 | /// |

259 | /// When running a non-overlapping search, an "earliest" search will |

260 | /// return the match location as early as possible. For example, given |

261 | /// the patterns `abc` and `b`, and a haystack of `abc`, a normal |

262 | /// leftmost-first search will return `abc` as a match. But an "earliest" |

263 | /// search will return as soon as it is known that a match occurs, which |

264 | /// happens once `b` is seen. |

265 | /// |

266 | /// Note that when using [`MatchKind::Standard`], the "earliest" option |

267 | /// has no effect since standard semantics are already "earliest." Note |

268 | /// also that this has no effect in overlapping searches, since overlapping |

269 | /// searches also use standard semantics and report all possible matches. |

270 | /// |

271 | /// This is disabled by default. |

272 | /// |

273 | /// # Example |

274 | /// |

275 | /// This example shows the difference between "earliest" searching and |

276 | /// normal leftmost searching. |

277 | /// |

278 | /// ``` |

279 | /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; |

280 | /// |

281 | /// let patterns = &["abc", "b"]; |

282 | /// let haystack = "abc"; |

283 | /// |

284 | /// let ac = AhoCorasick::builder() |

285 | /// .match_kind(MatchKind::LeftmostFirst) |

286 | /// .build(patterns) |

287 | /// .unwrap(); |

288 | /// |

289 | /// // The normal leftmost-first match. |

290 | /// let input = Input::new(haystack); |

291 | /// let mat = ac.try_find(input)?.expect("should have a match"); |

292 | /// assert_eq!("abc", &haystack[mat.span()]); |

293 | /// |

294 | /// // The "earliest" possible match, even if it isn't leftmost-first. |

295 | /// let input = Input::new(haystack).earliest(true); |

296 | /// let mat = ac.try_find(input)?.expect("should have a match"); |

297 | /// assert_eq!("b", &haystack[mat.span()]); |

298 | /// |

299 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |

300 | /// ``` |

301 | #[inline] |

302 | pub fn earliest(mut self, yes: bool) -> Input<'h> { |

303 | self.set_earliest(yes); |

304 | self |

305 | } |

306 | |

307 | /// Set the span for this search configuration. |

308 | /// |

309 | /// This is like the [`Input::span`] method, except this mutates the |

310 | /// span in place. |

311 | /// |

312 | /// This routine is generic over how a span is provided. While |

313 | /// a [`Span`] may be given directly, one may also provide a |

314 | /// `std::ops::Range<usize>`. |

315 | /// |

316 | /// # Panics |

317 | /// |

318 | /// This panics if the given span does not correspond to valid bounds in |

319 | /// the haystack or the termination of a search. |

320 | /// |

321 | /// # Example |

322 | /// |

323 | /// ``` |

324 | /// use aho_corasick::Input; |

325 | /// |

326 | /// let mut input = Input::new("foobar"); |

327 | /// assert_eq!(0..6, input.get_range()); |

328 | /// input.set_span(2..4); |

329 | /// assert_eq!(2..4, input.get_range()); |

330 | /// ``` |

331 | #[inline] |

332 | pub fn set_span<S: Into<Span>>(&mut self, span: S) { |

333 | let span = span.into(); |

334 | assert!( |

335 | span.end <= self.haystack.len() |

336 | && span.start <= span.end.wrapping_add(1), |

337 | "invalid span {:?} for haystack of length {}", |

338 | span, |

339 | self.haystack.len(), |

340 | ); |

341 | self.span = span; |

342 | } |

343 | |

344 | /// Set the span for this search configuration given any range. |

345 | /// |

346 | /// This is like the [`Input::range`] method, except this mutates the |

347 | /// span in place. |

348 | /// |

349 | /// # Panics |

350 | /// |

351 | /// This routine will panic if the given range could not be converted |

352 | /// to a valid [`Range`]. For example, this would panic when given |

353 | /// `0..=usize::MAX` since it cannot be represented using a half-open |

354 | /// interval in terms of `usize`. |

355 | /// |

356 | /// This routine also panics if the given range does not correspond to |

357 | /// valid bounds in the haystack or the termination of a search. |

358 | /// |

359 | /// # Example |

360 | /// |

361 | /// ``` |

362 | /// use aho_corasick::Input; |

363 | /// |

364 | /// let mut input = Input::new("foobar"); |

365 | /// assert_eq!(0..6, input.get_range()); |

366 | /// input.set_range(2..=4); |

367 | /// assert_eq!(2..5, input.get_range()); |

368 | /// ``` |

369 | #[inline] |

370 | pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { |

371 | use core::ops::Bound; |

372 | |

373 | // It's a little weird to convert ranges into spans, and then spans |

374 | // back into ranges when we actually slice the haystack. Because |

375 | // of that process, we always represent everything as a half-open |

376 | // internal. Therefore, handling things like m..=n is a little awkward. |

377 | let start = match range.start_bound() { |

378 | Bound::Included(&i) => i, |

379 | // Can this case ever happen? Range syntax doesn't support it... |

380 | Bound::Excluded(&i) => i.checked_add(1).unwrap(), |

381 | Bound::Unbounded => 0, |

382 | }; |

383 | let end = match range.end_bound() { |

384 | Bound::Included(&i) => i.checked_add(1).unwrap(), |

385 | Bound::Excluded(&i) => i, |

386 | Bound::Unbounded => self.haystack().len(), |

387 | }; |

388 | self.set_span(Span { start, end }); |

389 | } |

390 | |

391 | /// Set the starting offset for the span for this search configuration. |

392 | /// |

393 | /// This is a convenience routine for only mutating the start of a span |

394 | /// without having to set the entire span. |

395 | /// |

396 | /// # Panics |

397 | /// |

398 | /// This panics if the given span does not correspond to valid bounds in |

399 | /// the haystack or the termination of a search. |

400 | /// |

401 | /// # Example |

402 | /// |

403 | /// ``` |

404 | /// use aho_corasick::Input; |

405 | /// |

406 | /// let mut input = Input::new("foobar"); |

407 | /// assert_eq!(0..6, input.get_range()); |

408 | /// input.set_start(5); |

409 | /// assert_eq!(5..6, input.get_range()); |

410 | /// ``` |

411 | #[inline] |

412 | pub fn set_start(&mut self, start: usize) { |

413 | self.set_span(Span { start, ..self.get_span() }); |

414 | } |

415 | |

416 | /// Set the ending offset for the span for this search configuration. |

417 | /// |

418 | /// This is a convenience routine for only mutating the end of a span |

419 | /// without having to set the entire span. |

420 | /// |

421 | /// # Panics |

422 | /// |

423 | /// This panics if the given span does not correspond to valid bounds in |

424 | /// the haystack or the termination of a search. |

425 | /// |

426 | /// # Example |

427 | /// |

428 | /// ``` |

429 | /// use aho_corasick::Input; |

430 | /// |

431 | /// let mut input = Input::new("foobar"); |

432 | /// assert_eq!(0..6, input.get_range()); |

433 | /// input.set_end(5); |

434 | /// assert_eq!(0..5, input.get_range()); |

435 | /// ``` |

436 | #[inline] |

437 | pub fn set_end(&mut self, end: usize) { |

438 | self.set_span(Span { end, ..self.get_span() }); |

439 | } |

440 | |

441 | /// Set the anchor mode of a search. |

442 | /// |

443 | /// This is like [`Input::anchored`], except it mutates the search |

444 | /// configuration in place. |

445 | /// |

446 | /// # Example |

447 | /// |

448 | /// ``` |

449 | /// use aho_corasick::{Anchored, Input}; |

450 | /// |

451 | /// let mut input = Input::new("foobar"); |

452 | /// assert_eq!(Anchored::No, input.get_anchored()); |

453 | /// |

454 | /// input.set_anchored(Anchored::Yes); |

455 | /// assert_eq!(Anchored::Yes, input.get_anchored()); |

456 | /// ``` |

457 | #[inline] |

458 | pub fn set_anchored(&mut self, mode: Anchored) { |

459 | self.anchored = mode; |

460 | } |

461 | |

462 | /// Set whether the search should execute in "earliest" mode or not. |

463 | /// |

464 | /// This is like [`Input::earliest`], except it mutates the search |

465 | /// configuration in place. |

466 | /// |

467 | /// # Example |

468 | /// |

469 | /// ``` |

470 | /// use aho_corasick::Input; |

471 | /// |

472 | /// let mut input = Input::new("foobar"); |

473 | /// assert!(!input.get_earliest()); |

474 | /// input.set_earliest(true); |

475 | /// assert!(input.get_earliest()); |

476 | /// ``` |

477 | #[inline] |

478 | pub fn set_earliest(&mut self, yes: bool) { |

479 | self.earliest = yes; |

480 | } |

481 | |

482 | /// Return a borrow of the underlying haystack as a slice of bytes. |

483 | /// |

484 | /// # Example |

485 | /// |

486 | /// ``` |

487 | /// use aho_corasick::Input; |

488 | /// |

489 | /// let input = Input::new("foobar"); |

490 | /// assert_eq!(b"foobar", input.haystack()); |

491 | /// ``` |

492 | #[inline] |

493 | pub fn haystack(&self) -> &[u8] { |

494 | self.haystack |

495 | } |

496 | |

497 | /// Return the start position of this search. |

498 | /// |

499 | /// This is a convenience routine for `search.get_span().start()`. |

500 | /// |

501 | /// # Example |

502 | /// |

503 | /// ``` |

504 | /// use aho_corasick::Input; |

505 | /// |

506 | /// let input = Input::new("foobar"); |

507 | /// assert_eq!(0, input.start()); |

508 | /// |

509 | /// let input = Input::new("foobar").span( 2..4); |

510 | /// assert_eq!(2, input.start()); |

511 | /// ``` |

512 | #[inline] |

513 | pub fn start(&self) -> usize { |

514 | self.get_span().start |

515 | } |

516 | |

517 | /// Return the end position of this search. |

518 | /// |

519 | /// This is a convenience routine for `search.get_span().end()`. |

520 | /// |

521 | /// # Example |

522 | /// |

523 | /// ``` |

524 | /// use aho_corasick::Input; |

525 | /// |

526 | /// let input = Input::new("foobar"); |

527 | /// assert_eq!(6, input.end()); |

528 | /// |

529 | /// let input = Input::new("foobar").span( 2..4); |

530 | /// assert_eq!(4, input.end()); |

531 | /// ``` |

532 | #[inline] |

533 | pub fn end(&self) -> usize { |

534 | self.get_span().end |

535 | } |

536 | |

537 | /// Return the span for this search configuration. |

538 | /// |

539 | /// If one was not explicitly set, then the span corresponds to the entire |

540 | /// range of the haystack. |

541 | /// |

542 | /// # Example |

543 | /// |

544 | /// ``` |

545 | /// use aho_corasick::{Input, Span}; |

546 | /// |

547 | /// let input = Input::new("foobar"); |

548 | /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); |

549 | /// ``` |

550 | #[inline] |

551 | pub fn get_span(&self) -> Span { |

552 | self.span |

553 | } |

554 | |

555 | /// Return the span as a range for this search configuration. |

556 | /// |

557 | /// If one was not explicitly set, then the span corresponds to the entire |

558 | /// range of the haystack. |

559 | /// |

560 | /// # Example |

561 | /// |

562 | /// ``` |

563 | /// use aho_corasick::Input; |

564 | /// |

565 | /// let input = Input::new("foobar"); |

566 | /// assert_eq!(0..6, input.get_range()); |

567 | /// ``` |

568 | #[inline] |

569 | pub fn get_range(&self) -> Range<usize> { |

570 | self.get_span().range() |

571 | } |

572 | |

573 | /// Return the anchored mode for this search configuration. |

574 | /// |

575 | /// If no anchored mode was set, then it defaults to [`Anchored::No`]. |

576 | /// |

577 | /// # Example |

578 | /// |

579 | /// ``` |

580 | /// use aho_corasick::{Anchored, Input}; |

581 | /// |

582 | /// let mut input = Input::new("foobar"); |

583 | /// assert_eq!(Anchored::No, input.get_anchored()); |

584 | /// |

585 | /// input.set_anchored(Anchored::Yes); |

586 | /// assert_eq!(Anchored::Yes, input.get_anchored()); |

587 | /// ``` |

588 | #[inline] |

589 | pub fn get_anchored(&self) -> Anchored { |

590 | self.anchored |

591 | } |

592 | |

593 | /// Return whether this search should execute in "earliest" mode. |

594 | /// |

595 | /// # Example |

596 | /// |

597 | /// ``` |

598 | /// use aho_corasick::Input; |

599 | /// |

600 | /// let input = Input::new("foobar"); |

601 | /// assert!(!input.get_earliest()); |

602 | /// ``` |

603 | #[inline] |

604 | pub fn get_earliest(&self) -> bool { |

605 | self.earliest |

606 | } |

607 | |

608 | /// Return true if this input has been exhausted, which in turn means all |

609 | /// subsequent searches will return no matches. |

610 | /// |

611 | /// This occurs precisely when the start position of this search is greater |

612 | /// than the end position of the search. |

613 | /// |

614 | /// # Example |

615 | /// |

616 | /// ``` |

617 | /// use aho_corasick::Input; |

618 | /// |

619 | /// let mut input = Input::new("foobar"); |

620 | /// assert!(!input.is_done()); |

621 | /// input.set_start(6); |

622 | /// assert!(!input.is_done()); |

623 | /// input.set_start(7); |

624 | /// assert!(input.is_done()); |

625 | /// ``` |

626 | #[inline] |

627 | pub fn is_done(&self) -> bool { |

628 | self.get_span().start > self.get_span().end |

629 | } |

630 | } |

631 | |

632 | impl<'h> core::fmt::Debug for Input<'h> { |

633 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

634 | let mut fmter = f.debug_struct("Input"); |

635 | match core::str::from_utf8(self.haystack()) { |

636 | Ok(nice) => fmter.field("haystack", &nice), |

637 | Err(_) => fmter.field("haystack", &self.haystack()), |

638 | } |

639 | .field("span", &self.span) |

640 | .field("anchored", &self.anchored) |

641 | .field("earliest", &self.earliest) |

642 | .finish() |

643 | } |

644 | } |

645 | |

646 | impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { |

647 | #[inline] |

648 | fn from(haystack: &'h H) -> Input<'h> { |

649 | Input::new(haystack) |

650 | } |

651 | } |

652 | |

653 | /// A representation of a range in a haystack. |

654 | /// |

655 | /// A span corresponds to the starting and ending _byte offsets_ of a |

656 | /// contiguous region of bytes. The starting offset is inclusive while the |

657 | /// ending offset is exclusive. That is, a span is a half-open interval. |

658 | /// |

659 | /// A span is used to report the offsets of a match, but it is also used to |

660 | /// convey which region of a haystack should be searched via routines like |

661 | /// [`Input::span`]. |

662 | /// |

663 | /// This is basically equivalent to a `std::ops::Range<usize>`, except this |

664 | /// type implements `Copy` which makes it more ergonomic to use in the context |

665 | /// of this crate. Indeed, `Span` exists only because `Range<usize>` does |

666 | /// not implement `Copy`. Like a range, this implements `Index` for `[u8]` |

667 | /// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls |

668 | /// `From<Range>`, which means things like `Span::from(5..10)` work. |

669 | /// |

670 | /// There are no constraints on the values of a span. It is, for example, legal |

671 | /// to create a span where `start > end`. |

672 | #[derive(Clone, Copy, Eq, Hash, PartialEq)] |

673 | pub struct Span { |

674 | /// The start offset of the span, inclusive. |

675 | pub start: usize, |

676 | /// The end offset of the span, exclusive. |

677 | pub end: usize, |

678 | } |

679 | |

680 | impl Span { |

681 | /// Returns this span as a range. |

682 | #[inline] |

683 | pub fn range(&self) -> Range<usize> { |

684 | Range::from(*self) |

685 | } |

686 | |

687 | /// Returns true when this span is empty. That is, when `start >= end`. |

688 | #[inline] |

689 | pub fn is_empty(&self) -> bool { |

690 | self.start >= self.end |

691 | } |

692 | |

693 | /// Returns the length of this span. |

694 | /// |

695 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |

696 | #[inline] |

697 | pub fn len(&self) -> usize { |

698 | self.end.saturating_sub(self.start) |

699 | } |

700 | |

701 | /// Returns true when the given offset is contained within this span. |

702 | /// |

703 | /// Note that an empty span contains no offsets and will always return |

704 | /// false. |

705 | #[inline] |

706 | pub fn contains(&self, offset: usize) -> bool { |

707 | !self.is_empty() && self.start <= offset && offset <= self.end |

708 | } |

709 | |

710 | /// Returns a new span with `offset` added to this span's `start` and `end` |

711 | /// values. |

712 | #[inline] |

713 | pub fn offset(&self, offset: usize) -> Span { |

714 | Span { start: self.start + offset, end: self.end + offset } |

715 | } |

716 | } |

717 | |

718 | impl core::fmt::Debug for Span { |

719 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

720 | write!(f, "{}..{}", self.start, self.end) |

721 | } |

722 | } |

723 | |

724 | impl core::ops::Index<Span> for [u8] { |

725 | type Output = [u8]; |

726 | |

727 | #[inline] |

728 | fn index(&self, index: Span) -> &[u8] { |

729 | &self[index.range()] |

730 | } |

731 | } |

732 | |

733 | impl core::ops::IndexMut<Span> for [u8] { |

734 | #[inline] |

735 | fn index_mut(&mut self, index: Span) -> &mut [u8] { |

736 | &mut self[index.range()] |

737 | } |

738 | } |

739 | |

740 | impl core::ops::Index<Span> for str { |

741 | type Output = str; |

742 | |

743 | #[inline] |

744 | fn index(&self, index: Span) -> &str { |

745 | &self[index.range()] |

746 | } |

747 | } |

748 | |

749 | impl From<Range<usize>> for Span { |

750 | #[inline] |

751 | fn from(range: Range<usize>) -> Span { |

752 | Span { start: range.start, end: range.end } |

753 | } |

754 | } |

755 | |

756 | impl From<Span> for Range<usize> { |

757 | #[inline] |

758 | fn from(span: Span) -> Range<usize> { |

759 | Range { start: span.start, end: span.end } |

760 | } |

761 | } |

762 | |

763 | impl PartialEq<Range<usize>> for Span { |

764 | #[inline] |

765 | fn eq(&self, range: &Range<usize>) -> bool { |

766 | self.start == range.start && self.end == range.end |

767 | } |

768 | } |

769 | |

770 | impl PartialEq<Span> for Range<usize> { |

771 | #[inline] |

772 | fn eq(&self, span: &Span) -> bool { |

773 | self.start == span.start && self.end == span.end |

774 | } |

775 | } |

776 | |

777 | /// The type of anchored search to perform. |

778 | /// |

779 | /// If an Aho-Corasick searcher does not support the anchored mode selected, |

780 | /// then the search will return an error or panic, depending on whether a |

781 | /// fallible or an infallible routine was called. |

782 | #[non_exhaustive] |

783 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |

784 | pub enum Anchored { |

785 | /// Run an unanchored search. This means a match may occur anywhere at or |

786 | /// after the start position of the search up until the end position of the |

787 | /// search. |

788 | No, |

789 | /// Run an anchored search. This means that a match must begin at the start |

790 | /// position of the search and end before the end position of the search. |

791 | Yes, |

792 | } |

793 | |

794 | impl Anchored { |

795 | /// Returns true if and only if this anchor mode corresponds to an anchored |

796 | /// search. |

797 | /// |

798 | /// # Example |

799 | /// |

800 | /// ``` |

801 | /// use aho_corasick::Anchored; |

802 | /// |

803 | /// assert!(!Anchored::No.is_anchored()); |

804 | /// assert!(Anchored::Yes.is_anchored()); |

805 | /// ``` |

806 | #[inline] |

807 | pub fn is_anchored(&self) -> bool { |

808 | matches!(*self, Anchored::Yes) |

809 | } |

810 | } |

811 | |

812 | /// A representation of a match reported by an Aho-Corasick searcher. |

813 | /// |

814 | /// A match has two essential pieces of information: the [`PatternID`] that |

815 | /// matches, and the [`Span`] of the match in a haystack. |

816 | /// |

817 | /// The pattern is identified by an ID, which corresponds to its position |

818 | /// (starting from `0`) relative to other patterns used to construct the |

819 | /// corresponding searcher. If only a single pattern is provided, then all |

820 | /// matches are guaranteed to have a pattern ID of `0`. |

821 | /// |

822 | /// Every match reported by a searcher guarantees that its span has its start |

823 | /// offset as less than or equal to its end offset. |

824 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] |

825 | pub struct Match { |

826 | /// The pattern ID. |

827 | pattern: PatternID, |

828 | /// The underlying match span. |

829 | span: Span, |

830 | } |

831 | |

832 | impl Match { |

833 | /// Create a new match from a pattern ID and a span. |

834 | /// |

835 | /// This constructor is generic over how a span is provided. While |

836 | /// a [`Span`] may be given directly, one may also provide a |

837 | /// `std::ops::Range<usize>`. |

838 | /// |

839 | /// # Panics |

840 | /// |

841 | /// This panics if `end < start`. |

842 | /// |

843 | /// # Example |

844 | /// |

845 | /// This shows how to create a match for the first pattern in an |

846 | /// Aho-Corasick searcher using convenient range syntax. |

847 | /// |

848 | /// ``` |

849 | /// use aho_corasick::{Match, PatternID}; |

850 | /// |

851 | /// let m = Match::new(PatternID::ZERO, 5..10); |

852 | /// assert_eq!(0, m.pattern().as_usize()); |

853 | /// assert_eq!(5, m.start()); |

854 | /// assert_eq!(10, m.end()); |

855 | /// ``` |

856 | #[inline] |

857 | pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { |

858 | let span = span.into(); |

859 | assert!(span.start <= span.end, "invalid match span"); |

860 | Match { pattern, span } |

861 | } |

862 | |

863 | /// Create a new match from a pattern ID and a byte offset span. |

864 | /// |

865 | /// This constructor is generic over how a span is provided. While |

866 | /// a [`Span`] may be given directly, one may also provide a |

867 | /// `std::ops::Range<usize>`. |

868 | /// |

869 | /// This is like [`Match::new`], but accepts a `usize` instead of a |

870 | /// [`PatternID`]. This panics if the given `usize` is not representable |

871 | /// as a `PatternID`. |

872 | /// |

873 | /// # Panics |

874 | /// |

875 | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |

876 | /// |

877 | /// # Example |

878 | /// |

879 | /// This shows how to create a match for the third pattern in an |

880 | /// Aho-Corasick searcher using convenient range syntax. |

881 | /// |

882 | /// ``` |

883 | /// use aho_corasick::Match; |

884 | /// |

885 | /// let m = Match::must(3, 5..10); |

886 | /// assert_eq!(3, m.pattern().as_usize()); |

887 | /// assert_eq!(5, m.start()); |

888 | /// assert_eq!(10, m.end()); |

889 | /// ``` |

890 | #[inline] |

891 | pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { |

892 | Match::new(PatternID::must(pattern), span) |

893 | } |

894 | |

895 | /// Returns the ID of the pattern that matched. |

896 | /// |

897 | /// The ID of a pattern is derived from the position in which it was |

898 | /// originally inserted into the corresponding searcher. The first pattern |

899 | /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on. |

900 | #[inline] |

901 | pub fn pattern(&self) -> PatternID { |

902 | self.pattern |

903 | } |

904 | |

905 | /// The starting position of the match. |

906 | /// |

907 | /// This is a convenience routine for `Match::span().start`. |

908 | #[inline] |

909 | pub fn start(&self) -> usize { |

910 | self.span().start |

911 | } |

912 | |

913 | /// The ending position of the match. |

914 | /// |

915 | /// This is a convenience routine for `Match::span().end`. |

916 | #[inline] |

917 | pub fn end(&self) -> usize { |

918 | self.span().end |

919 | } |

920 | |

921 | /// Returns the match span as a range. |

922 | /// |

923 | /// This is a convenience routine for `Match::span().range()`. |

924 | #[inline] |

925 | pub fn range(&self) -> core::ops::Range<usize> { |

926 | self.span().range() |

927 | } |

928 | |

929 | /// Returns the span for this match. |

930 | #[inline] |

931 | pub fn span(&self) -> Span { |

932 | self.span |

933 | } |

934 | |

935 | /// Returns true when the span in this match is empty. |

936 | /// |

937 | /// An empty match can only be returned when empty pattern is in the |

938 | /// Aho-Corasick searcher. |

939 | #[inline] |

940 | pub fn is_empty(&self) -> bool { |

941 | self.span().is_empty() |

942 | } |

943 | |

944 | /// Returns the length of this match. |

945 | /// |

946 | /// This returns `0` in precisely the cases that `is_empty` returns `true`. |

947 | #[inline] |

948 | pub fn len(&self) -> usize { |

949 | self.span().len() |

950 | } |

951 | |

952 | /// Returns a new match with `offset` added to its span's `start` and `end` |

953 | /// values. |

954 | #[inline] |

955 | pub fn offset(&self, offset: usize) -> Match { |

956 | Match { |

957 | pattern: self.pattern, |

958 | span: Span { |

959 | start: self.start() + offset, |

960 | end: self.end() + offset, |

961 | }, |

962 | } |

963 | } |

964 | } |

965 | |

966 | /// A knob for controlling the match semantics of an Aho-Corasick automaton. |

967 | /// |

968 | /// There are two generally different ways that Aho-Corasick automatons can |

969 | /// report matches. The first way is the "standard" approach that results from |

970 | /// implementing most textbook explanations of Aho-Corasick. The second way is |

971 | /// to report only the leftmost non-overlapping matches. The leftmost approach |

972 | /// is in turn split into two different ways of resolving ambiguous matches: |

973 | /// leftmost-first and leftmost-longest. |

974 | /// |

975 | /// The `Standard` match kind is the default and is the only one that supports |

976 | /// overlapping matches and stream searching. (Trying to find overlapping or |

977 | /// streaming matches using leftmost match semantics will result in an error in |

978 | /// fallible APIs and a panic when using infallibe APIs.) The `Standard` match |

979 | /// kind will report matches as they are seen. When searching for overlapping |

980 | /// matches, then all possible matches are reported. When searching for |

981 | /// non-overlapping matches, the first match seen is reported. For example, for |

982 | /// non-overlapping matches, given the patterns `abcd` and `b` and the haystack |

983 | /// `abcdef`, only a match for `b` is reported since it is detected first. The |

984 | /// `abcd` match is never reported since it overlaps with the `b` match. |

985 | /// |

986 | /// In contrast, the leftmost match kind always prefers the leftmost match |

987 | /// among all possible matches. Given the same example as above with `abcd` and |

988 | /// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd` |

989 | /// since it begins before the `b` match, even though the `b` match is detected |

990 | /// before the `abcd` match. In this case, the `b` match is not reported at all |

991 | /// since it overlaps with the `abcd` match. |

992 | /// |

993 | /// The difference between leftmost-first and leftmost-longest is in how they |

994 | /// resolve ambiguous matches when there are multiple leftmost matches to |

995 | /// choose from. Leftmost-first always chooses the pattern that was provided |

996 | /// earliest, where as leftmost-longest always chooses the longest matching |

997 | /// pattern. For example, given the patterns `a` and `ab` and the subject |

998 | /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match |

999 | /// is `ab`. Conversely, if the patterns were given in reverse order, i.e., |

1000 | /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches |

1001 | /// would be `ab`. Stated differently, the leftmost-first match depends on the |

1002 | /// order in which the patterns were given to the Aho-Corasick automaton. |

1003 | /// Because of that, when leftmost-first matching is used, if a pattern `A` |

1004 | /// that appears before a pattern `B` is a prefix of `B`, then it is impossible |

1005 | /// to ever observe a match of `B`. |

1006 | /// |

1007 | /// If you're not sure which match kind to pick, then stick with the standard |

1008 | /// kind, which is the default. In particular, if you need overlapping or |

1009 | /// streaming matches, then you _must_ use the standard kind. The leftmost |

1010 | /// kinds are useful in specific circumstances. For example, leftmost-first can |

1011 | /// be very useful as a way to implement match priority based on the order of |

1012 | /// patterns given and leftmost-longest can be useful for dictionary searching |

1013 | /// such that only the longest matching words are reported. |

1014 | /// |

1015 | /// # Relationship with regular expression alternations |

1016 | /// |

1017 | /// Understanding match semantics can be a little tricky, and one easy way |

1018 | /// to conceptualize non-overlapping matches from an Aho-Corasick automaton |

1019 | /// is to think about them as a simple alternation of literals in a regular |

1020 | /// expression. For example, let's say we wanted to match the strings |

1021 | /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It |

1022 | /// turns out that regular expression engines have two different ways of |

1023 | /// matching this alternation. The first way, leftmost-longest, is commonly |

1024 | /// found in POSIX compatible implementations of regular expressions (such as |

1025 | /// `grep`). The second way, leftmost-first, is commonly found in backtracking |

1026 | /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's |

1027 | /// regex engine do not use backtracking, but still implement leftmost-first |

1028 | /// semantics in an effort to match the behavior of dominant backtracking |

1029 | /// regex engines such as those found in Perl, Ruby, Python, Javascript and |

1030 | /// PHP.) |

1031 | /// |

1032 | /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex |

1033 | /// will match `Samwise` because it is the longest possible match, but a |

1034 | /// Perl-like regex will match `Sam` since it appears earlier in the |

1035 | /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine |

1036 | /// will never match `Samwise` since `Sam` will always have higher priority. |

1037 | /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to |

1038 | /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is |

1039 | /// still longest match, but it also appears earlier than `Sam`. |

1040 | /// |

1041 | /// The "standard" match semantics of Aho-Corasick generally don't correspond |

1042 | /// to the match semantics of any large group of regex implementations, so |

1043 | /// there's no direct analogy that can be made here. Standard match semantics |

1044 | /// are generally useful for overlapping matches, or if you just want to see |

1045 | /// matches as they are detected. |

1046 | /// |

1047 | /// The main conclusion to draw from this section is that the match semantics |

1048 | /// can be tweaked to precisely match either Perl-like regex alternations or |

1049 | /// POSIX regex alternations. |

1050 | #[non_exhaustive] |

1051 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |

1052 | pub enum MatchKind { |

1053 | /// Use standard match semantics, which support overlapping matches. When |

1054 | /// used with non-overlapping matches, matches are reported as they are |

1055 | /// seen. |

1056 | Standard, |

1057 | /// Use leftmost-first match semantics, which reports leftmost matches. |

1058 | /// When there are multiple possible leftmost matches, the match |

1059 | /// corresponding to the pattern that appeared earlier when constructing |

1060 | /// the automaton is reported. |

1061 | /// |

1062 | /// This does **not** support overlapping matches or stream searching. If |

1063 | /// this match kind is used, attempting to find overlapping matches or |

1064 | /// stream matches will fail. |

1065 | LeftmostFirst, |

1066 | /// Use leftmost-longest match semantics, which reports leftmost matches. |

1067 | /// When there are multiple possible leftmost matches, the longest match |

1068 | /// is chosen. |

1069 | /// |

1070 | /// This does **not** support overlapping matches or stream searching. If |

1071 | /// this match kind is used, attempting to find overlapping matches or |

1072 | /// stream matches will fail. |

1073 | LeftmostLongest, |

1074 | } |

1075 | |

1076 | /// The default match kind is `MatchKind::Standard`. |

1077 | impl Default for MatchKind { |

1078 | fn default() -> MatchKind { |

1079 | MatchKind::Standard |

1080 | } |

1081 | } |

1082 | |

1083 | impl MatchKind { |

1084 | #[inline] |

1085 | pub(crate) fn is_standard(&self) -> bool { |

1086 | matches!(*self, MatchKind::Standard) |

1087 | } |

1088 | |

1089 | #[inline] |

1090 | pub(crate) fn is_leftmost(&self) -> bool { |

1091 | matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest) |

1092 | } |

1093 | |

1094 | #[inline] |

1095 | pub(crate) fn is_leftmost_first(&self) -> bool { |

1096 | matches!(*self, MatchKind::LeftmostFirst) |

1097 | } |

1098 | |

1099 | /// Convert this match kind into a packed match kind. If this match kind |

1100 | /// corresponds to standard semantics, then this returns None, since |

1101 | /// packed searching does not support standard semantics. |

1102 | #[inline] |

1103 | pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> { |

1104 | match *self { |

1105 | MatchKind::Standard => None, |

1106 | MatchKind::LeftmostFirst => { |

1107 | Some(crate::packed::MatchKind::LeftmostFirst) |

1108 | } |

1109 | MatchKind::LeftmostLongest => { |

1110 | Some(crate::packed::MatchKind::LeftmostLongest) |

1111 | } |

1112 | } |

1113 | } |

1114 | } |

1115 | |

1116 | /// The kind of anchored starting configurations to support in an Aho-Corasick |

1117 | /// searcher. |

1118 | /// |

1119 | /// Depending on which searcher is used internally by |

1120 | /// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored |

1121 | /// and anchored searches can be quite costly. For this reason, |

1122 | /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |

1123 | /// can be used to configure whether your searcher supports unanchored, |

1124 | /// anchored or both kinds of searches. |

1125 | /// |

1126 | /// This searcher configuration knob works in concert with the search time |

1127 | /// configuration [`Input::anchored`]. Namely, if one requests an unsupported |

1128 | /// anchored mode, then the search will either panic or return an error, |

1129 | /// depending on whether you're using infallible or fallibe APIs, respectively. |

1130 | /// |

1131 | /// `AhoCorasick` by default only supports unanchored searches. |

1132 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |

1133 | pub enum StartKind { |

1134 | /// Support both anchored and unanchored searches. |

1135 | Both, |

1136 | /// Support only unanchored searches. Requesting an anchored search will |

1137 | /// return an error in fallible APIs and panic in infallible APIs. |

1138 | Unanchored, |

1139 | /// Support only anchored searches. Requesting an unanchored search will |

1140 | /// return an error in fallible APIs and panic in infallible APIs. |

1141 | Anchored, |

1142 | } |

1143 | |

1144 | impl Default for StartKind { |

1145 | fn default() -> StartKind { |

1146 | StartKind::Unanchored |

1147 | } |

1148 | } |

1149 |