1 | use std::io; |
2 | |
3 | use crate::automaton::Automaton; |
4 | use crate::buffer::Buffer; |
5 | use crate::dfa::{self, DFA}; |
6 | use crate::error::Result; |
7 | use crate::nfa::{self, NFA}; |
8 | use crate::packed; |
9 | use crate::prefilter::{Prefilter, PrefilterState}; |
10 | use crate::state_id::StateID; |
11 | use crate::Match; |
12 | |
13 | /// An automaton for searching multiple strings in linear time. |
14 | /// |
15 | /// The `AhoCorasick` type supports a few basic ways of constructing an |
16 | /// automaton, including |
17 | /// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new) |
18 | /// and |
19 | /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). |
20 | /// However, there are a fair number of configurable options that can be set |
21 | /// by using |
22 | /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) |
23 | /// instead. Such options include, but are not limited to, how matches are |
24 | /// determined, simple case insensitivity, whether to use a DFA or not and |
25 | /// various knobs for controlling the space-vs-time trade offs taken when |
26 | /// building the automaton. |
27 | /// |
28 | /// If you aren't sure where to start, try beginning with |
29 | /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). |
30 | /// |
31 | /// # Resource usage |
32 | /// |
33 | /// Aho-Corasick automatons are always constructed in `O(p)` time, where `p` |
34 | /// is the combined length of all patterns being searched. With that said, |
35 | /// building an automaton can be fairly costly because of high constant |
36 | /// factors, particularly when enabling the |
37 | /// [DFA](struct.AhoCorasickBuilder.html#method.dfa) |
38 | /// option (which is disabled by default). For this reason, it's generally a |
39 | /// good idea to build an automaton once and reuse it as much as possible. |
40 | /// |
41 | /// Aho-Corasick automatons can also use a fair bit of memory. To get a |
42 | /// concrete idea of how much memory is being used, try using the |
43 | /// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes) |
44 | /// method. |
45 | /// |
46 | /// # Examples |
47 | /// |
48 | /// This example shows how to search for occurrences of multiple patterns |
49 | /// simultaneously in a case insensitive fashion. Each match includes the |
50 | /// pattern that matched along with the byte offsets of the match. |
51 | /// |
52 | /// ``` |
53 | /// use aho_corasick::AhoCorasickBuilder; |
54 | /// |
55 | /// let patterns = &["apple" , "maple" , "snapple" ]; |
56 | /// let haystack = "Nobody likes maple in their apple flavored Snapple." ; |
57 | /// |
58 | /// let ac = AhoCorasickBuilder::new() |
59 | /// .ascii_case_insensitive(true) |
60 | /// .build(patterns); |
61 | /// let mut matches = vec![]; |
62 | /// for mat in ac.find_iter(haystack) { |
63 | /// matches.push((mat.pattern(), mat.start(), mat.end())); |
64 | /// } |
65 | /// assert_eq!(matches, vec![ |
66 | /// (1, 13, 18), |
67 | /// (0, 28, 33), |
68 | /// (2, 43, 50), |
69 | /// ]); |
70 | /// ``` |
71 | /// |
72 | /// This example shows how to replace matches with some other string: |
73 | /// |
74 | /// ``` |
75 | /// use aho_corasick::AhoCorasick; |
76 | /// |
77 | /// let patterns = &["fox" , "brown" , "quick" ]; |
78 | /// let haystack = "The quick brown fox." ; |
79 | /// let replace_with = &["sloth" , "grey" , "slow" ]; |
80 | /// |
81 | /// let ac = AhoCorasick::new(patterns); |
82 | /// let result = ac.replace_all(haystack, replace_with); |
83 | /// assert_eq!(result, "The slow grey sloth." ); |
84 | /// ``` |
85 | #[derive (Clone, Debug)] |
86 | pub struct AhoCorasick<S: StateID = usize> { |
87 | imp: Imp<S>, |
88 | match_kind: MatchKind, |
89 | } |
90 | |
91 | impl AhoCorasick { |
92 | /// Create a new Aho-Corasick automaton using the default configuration. |
93 | /// |
94 | /// The default configuration optimizes for less space usage, but at the |
95 | /// expense of longer search times. To change the configuration, use |
96 | /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) |
97 | /// for fine-grained control, or |
98 | /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) |
99 | /// for automatic configuration if you aren't sure which settings to pick. |
100 | /// |
101 | /// This uses the default |
102 | /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) |
103 | /// match semantics, which reports a match as soon as it is found. This |
104 | /// corresponds to the standard match semantics supported by textbook |
105 | /// descriptions of the Aho-Corasick algorithm. |
106 | /// |
107 | /// # Examples |
108 | /// |
109 | /// Basic usage: |
110 | /// |
111 | /// ``` |
112 | /// use aho_corasick::AhoCorasick; |
113 | /// |
114 | /// let ac = AhoCorasick::new(&[ |
115 | /// "foo" , "bar" , "baz" , |
116 | /// ]); |
117 | /// assert_eq!(Some(1), ac.find("xxx bar xxx" ).map(|m| m.pattern())); |
118 | /// ``` |
119 | pub fn new<I, P>(patterns: I) -> AhoCorasick |
120 | where |
121 | I: IntoIterator<Item = P>, |
122 | P: AsRef<[u8]>, |
123 | { |
124 | AhoCorasickBuilder::new().build(patterns) |
125 | } |
126 | |
127 | /// Build an Aho-Corasick automaton with an automatically determined |
128 | /// configuration. |
129 | /// |
130 | /// Specifically, this requires a slice of patterns instead of an iterator |
131 | /// since the configuration is determined by looking at the patterns before |
132 | /// constructing the automaton. The idea here is to balance space and time |
133 | /// automatically. That is, when searching a small number of patterns, this |
134 | /// will attempt to use the fastest possible configuration since the total |
135 | /// space required will be small anyway. As the number of patterns grows, |
136 | /// this will fall back to slower configurations that use less space. |
137 | /// |
138 | /// If you want auto configuration but with match semantics different from |
139 | /// the default `MatchKind::Standard`, then use |
140 | /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure). |
141 | /// |
142 | /// # Examples |
143 | /// |
144 | /// Basic usage is just like `new`, except you must provide a slice: |
145 | /// |
146 | /// ``` |
147 | /// use aho_corasick::AhoCorasick; |
148 | /// |
149 | /// let ac = AhoCorasick::new_auto_configured(&[ |
150 | /// "foo" , "bar" , "baz" , |
151 | /// ]); |
152 | /// assert_eq!(Some(1), ac.find("xxx bar xxx" ).map(|m| m.pattern())); |
153 | /// ``` |
154 | pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick |
155 | where |
156 | B: AsRef<[u8]>, |
157 | { |
158 | AhoCorasickBuilder::new().auto_configure(patterns).build(patterns) |
159 | } |
160 | } |
161 | |
162 | impl<S: StateID> AhoCorasick<S> { |
163 | /// Returns true if and only if this automaton matches the haystack at any |
164 | /// position. |
165 | /// |
166 | /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. |
167 | /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and |
168 | /// `&[u8]` itself. |
169 | /// |
170 | /// # Examples |
171 | /// |
172 | /// Basic usage: |
173 | /// |
174 | /// ``` |
175 | /// use aho_corasick::AhoCorasick; |
176 | /// |
177 | /// let ac = AhoCorasick::new(&[ |
178 | /// "foo" , "bar" , "quux" , "baz" , |
179 | /// ]); |
180 | /// assert!(ac.is_match("xxx bar xxx" )); |
181 | /// assert!(!ac.is_match("xxx qux xxx" )); |
182 | /// ``` |
183 | pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool { |
184 | self.earliest_find(haystack).is_some() |
185 | } |
186 | |
187 | /// Returns the location of the first detected match in `haystack`. |
188 | /// |
189 | /// This method has the same behavior regardless of the |
190 | /// [`MatchKind`](enum.MatchKind.html) |
191 | /// of this automaton. |
192 | /// |
193 | /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. |
194 | /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and |
195 | /// `&[u8]` itself. |
196 | /// |
197 | /// # Examples |
198 | /// |
199 | /// Basic usage: |
200 | /// |
201 | /// ``` |
202 | /// use aho_corasick::AhoCorasick; |
203 | /// |
204 | /// let ac = AhoCorasick::new(&[ |
205 | /// "abc" , "b" , |
206 | /// ]); |
207 | /// let mat = ac.earliest_find("abcd" ).expect("should have match" ); |
208 | /// assert_eq!(1, mat.pattern()); |
209 | /// assert_eq!((1, 2), (mat.start(), mat.end())); |
210 | /// ``` |
211 | pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { |
212 | let mut prestate = PrefilterState::new(self.max_pattern_len()); |
213 | let mut start = self.imp.start_state(); |
214 | self.imp.earliest_find_at( |
215 | &mut prestate, |
216 | haystack.as_ref(), |
217 | 0, |
218 | &mut start, |
219 | ) |
220 | } |
221 | |
222 | /// Returns the location of the first match according to the match |
223 | /// semantics that this automaton was constructed with. |
224 | /// |
225 | /// When using `MatchKind::Standard`, this corresponds precisely to the |
226 | /// same behavior as |
227 | /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find). |
228 | /// Otherwise, match semantics correspond to either |
229 | /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) |
230 | /// or |
231 | /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest). |
232 | /// |
233 | /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. |
234 | /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and |
235 | /// `&[u8]` itself. |
236 | /// |
237 | /// # Examples |
238 | /// |
239 | /// Basic usage, with standard semantics: |
240 | /// |
241 | /// ``` |
242 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
243 | /// |
244 | /// let patterns = &["b" , "abc" , "abcd" ]; |
245 | /// let haystack = "abcd" ; |
246 | /// |
247 | /// let ac = AhoCorasickBuilder::new() |
248 | /// .match_kind(MatchKind::Standard) // default, not necessary |
249 | /// .build(patterns); |
250 | /// let mat = ac.find(haystack).expect("should have a match" ); |
251 | /// assert_eq!("b" , &haystack[mat.start()..mat.end()]); |
252 | /// ``` |
253 | /// |
254 | /// Now with leftmost-first semantics: |
255 | /// |
256 | /// ``` |
257 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
258 | /// |
259 | /// let patterns = &["b" , "abc" , "abcd" ]; |
260 | /// let haystack = "abcd" ; |
261 | /// |
262 | /// let ac = AhoCorasickBuilder::new() |
263 | /// .match_kind(MatchKind::LeftmostFirst) |
264 | /// .build(patterns); |
265 | /// let mat = ac.find(haystack).expect("should have a match" ); |
266 | /// assert_eq!("abc" , &haystack[mat.start()..mat.end()]); |
267 | /// ``` |
268 | /// |
269 | /// And finally, leftmost-longest semantics: |
270 | /// |
271 | /// ``` |
272 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
273 | /// |
274 | /// let patterns = &["b" , "abc" , "abcd" ]; |
275 | /// let haystack = "abcd" ; |
276 | /// |
277 | /// let ac = AhoCorasickBuilder::new() |
278 | /// .match_kind(MatchKind::LeftmostLongest) |
279 | /// .build(patterns); |
280 | /// let mat = ac.find(haystack).expect("should have a match" ); |
281 | /// assert_eq!("abcd" , &haystack[mat.start()..mat.end()]); |
282 | /// ``` |
283 | pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { |
284 | let mut prestate = PrefilterState::new(self.max_pattern_len()); |
285 | self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0) |
286 | } |
287 | |
288 | /// Returns an iterator of non-overlapping matches, using the match |
289 | /// semantics that this automaton was constructed with. |
290 | /// |
291 | /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. |
292 | /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and |
293 | /// `&[u8]` itself. |
294 | /// |
295 | /// # Examples |
296 | /// |
297 | /// Basic usage, with standard semantics: |
298 | /// |
299 | /// ``` |
300 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
301 | /// |
302 | /// let patterns = &["append" , "appendage" , "app" ]; |
303 | /// let haystack = "append the app to the appendage" ; |
304 | /// |
305 | /// let ac = AhoCorasickBuilder::new() |
306 | /// .match_kind(MatchKind::Standard) // default, not necessary |
307 | /// .build(patterns); |
308 | /// let matches: Vec<usize> = ac |
309 | /// .find_iter(haystack) |
310 | /// .map(|mat| mat.pattern()) |
311 | /// .collect(); |
312 | /// assert_eq!(vec![2, 2, 2], matches); |
313 | /// ``` |
314 | /// |
315 | /// Now with leftmost-first semantics: |
316 | /// |
317 | /// ``` |
318 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
319 | /// |
320 | /// let patterns = &["append" , "appendage" , "app" ]; |
321 | /// let haystack = "append the app to the appendage" ; |
322 | /// |
323 | /// let ac = AhoCorasickBuilder::new() |
324 | /// .match_kind(MatchKind::LeftmostFirst) |
325 | /// .build(patterns); |
326 | /// let matches: Vec<usize> = ac |
327 | /// .find_iter(haystack) |
328 | /// .map(|mat| mat.pattern()) |
329 | /// .collect(); |
330 | /// assert_eq!(vec![0, 2, 0], matches); |
331 | /// ``` |
332 | /// |
333 | /// And finally, leftmost-longest semantics: |
334 | /// |
335 | /// ``` |
336 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
337 | /// |
338 | /// let patterns = &["append" , "appendage" , "app" ]; |
339 | /// let haystack = "append the app to the appendage" ; |
340 | /// |
341 | /// let ac = AhoCorasickBuilder::new() |
342 | /// .match_kind(MatchKind::LeftmostLongest) |
343 | /// .build(patterns); |
344 | /// let matches: Vec<usize> = ac |
345 | /// .find_iter(haystack) |
346 | /// .map(|mat| mat.pattern()) |
347 | /// .collect(); |
348 | /// assert_eq!(vec![0, 2, 1], matches); |
349 | /// ``` |
350 | pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( |
351 | &'a self, |
352 | haystack: &'b B, |
353 | ) -> FindIter<'a, 'b, S> { |
354 | FindIter::new(self, haystack.as_ref()) |
355 | } |
356 | |
357 | /// Returns an iterator of overlapping matches in the given `haystack`. |
358 | /// |
359 | /// Overlapping matches can _only_ be detected using |
360 | /// `MatchKind::Standard` semantics. If this automaton was constructed with |
361 | /// leftmost semantics, then this method will panic. To determine whether |
362 | /// this will panic at runtime, use the |
363 | /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping) |
364 | /// method. |
365 | /// |
366 | /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. |
367 | /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and |
368 | /// `&[u8]` itself. |
369 | /// |
370 | /// # Panics |
371 | /// |
372 | /// This panics when `AhoCorasick::supports_overlapping` returns `false`. |
373 | /// That is, this panics when this automaton's match semantics are not |
374 | /// `MatchKind::Standard`. |
375 | /// |
376 | /// # Examples |
377 | /// |
378 | /// Basic usage, with standard semantics: |
379 | /// |
380 | /// ``` |
381 | /// use aho_corasick::AhoCorasick; |
382 | /// |
383 | /// let patterns = &["append" , "appendage" , "app" ]; |
384 | /// let haystack = "append the app to the appendage" ; |
385 | /// |
386 | /// let ac = AhoCorasick::new(patterns); |
387 | /// let matches: Vec<usize> = ac |
388 | /// .find_overlapping_iter(haystack) |
389 | /// .map(|mat| mat.pattern()) |
390 | /// .collect(); |
391 | /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches); |
392 | /// ``` |
393 | pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( |
394 | &'a self, |
395 | haystack: &'b B, |
396 | ) -> FindOverlappingIter<'a, 'b, S> { |
397 | FindOverlappingIter::new(self, haystack.as_ref()) |
398 | } |
399 | |
400 | /// Replace all matches with a corresponding value in the `replace_with` |
401 | /// slice given. Matches correspond to the same matches as reported by |
402 | /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). |
403 | /// |
404 | /// Replacements are determined by the index of the matching pattern. |
405 | /// For example, if the pattern with index `2` is found, then it is |
406 | /// replaced by `replace_with[2]`. |
407 | /// |
408 | /// # Panics |
409 | /// |
410 | /// This panics when `replace_with.len()` does not equal the total number |
411 | /// of patterns that are matched by this automaton. |
412 | /// |
413 | /// # Examples |
414 | /// |
415 | /// Basic usage: |
416 | /// |
417 | /// ``` |
418 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
419 | /// |
420 | /// let patterns = &["append" , "appendage" , "app" ]; |
421 | /// let haystack = "append the app to the appendage" ; |
422 | /// |
423 | /// let ac = AhoCorasickBuilder::new() |
424 | /// .match_kind(MatchKind::LeftmostFirst) |
425 | /// .build(patterns); |
426 | /// let result = ac.replace_all(haystack, &["x" , "y" , "z" ]); |
427 | /// assert_eq!("x the z to the xage" , result); |
428 | /// ``` |
429 | pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String |
430 | where |
431 | B: AsRef<str>, |
432 | { |
433 | assert_eq!( |
434 | replace_with.len(), |
435 | self.pattern_count(), |
436 | "replace_all requires a replacement for every pattern \ |
437 | in the automaton" |
438 | ); |
439 | let mut dst = String::with_capacity(haystack.len()); |
440 | self.replace_all_with(haystack, &mut dst, |mat, _, dst| { |
441 | dst.push_str(replace_with[mat.pattern()].as_ref()); |
442 | true |
443 | }); |
444 | dst |
445 | } |
446 | |
447 | /// Replace all matches using raw bytes with a corresponding value in the |
448 | /// `replace_with` slice given. Matches correspond to the same matches as |
449 | /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter). |
450 | /// |
451 | /// Replacements are determined by the index of the matching pattern. |
452 | /// For example, if the pattern with index `2` is found, then it is |
453 | /// replaced by `replace_with[2]`. |
454 | /// |
455 | /// # Panics |
456 | /// |
457 | /// This panics when `replace_with.len()` does not equal the total number |
458 | /// of patterns that are matched by this automaton. |
459 | /// |
460 | /// # Examples |
461 | /// |
462 | /// Basic usage: |
463 | /// |
464 | /// ``` |
465 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
466 | /// |
467 | /// let patterns = &["append" , "appendage" , "app" ]; |
468 | /// let haystack = b"append the app to the appendage" ; |
469 | /// |
470 | /// let ac = AhoCorasickBuilder::new() |
471 | /// .match_kind(MatchKind::LeftmostFirst) |
472 | /// .build(patterns); |
473 | /// let result = ac.replace_all_bytes(haystack, &["x" , "y" , "z" ]); |
474 | /// assert_eq!(b"x the z to the xage" .to_vec(), result); |
475 | /// ``` |
476 | pub fn replace_all_bytes<B>( |
477 | &self, |
478 | haystack: &[u8], |
479 | replace_with: &[B], |
480 | ) -> Vec<u8> |
481 | where |
482 | B: AsRef<[u8]>, |
483 | { |
484 | assert_eq!( |
485 | replace_with.len(), |
486 | self.pattern_count(), |
487 | "replace_all_bytes requires a replacement for every pattern \ |
488 | in the automaton" |
489 | ); |
490 | let mut dst = Vec::with_capacity(haystack.len()); |
491 | self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { |
492 | dst.extend(replace_with[mat.pattern()].as_ref()); |
493 | true |
494 | }); |
495 | dst |
496 | } |
497 | |
498 | /// Replace all matches using a closure called on each match. |
499 | /// Matches correspond to the same matches as reported by |
500 | /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). |
501 | /// |
502 | /// The closure accepts three parameters: the match found, the text of |
503 | /// the match and a string buffer with which to write the replaced text |
504 | /// (if any). If the closure returns `true`, then it continues to the next |
505 | /// match. If the closure returns `false`, then searching is stopped. |
506 | /// |
507 | /// # Examples |
508 | /// |
509 | /// Basic usage: |
510 | /// |
511 | /// ``` |
512 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
513 | /// |
514 | /// let patterns = &["append" , "appendage" , "app" ]; |
515 | /// let haystack = "append the app to the appendage" ; |
516 | /// |
517 | /// let ac = AhoCorasickBuilder::new() |
518 | /// .match_kind(MatchKind::LeftmostFirst) |
519 | /// .build(patterns); |
520 | /// let mut result = String::new(); |
521 | /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { |
522 | /// dst.push_str(&mat.pattern().to_string()); |
523 | /// true |
524 | /// }); |
525 | /// assert_eq!("0 the 2 to the 0age" , result); |
526 | /// ``` |
527 | /// |
528 | /// Stopping the replacement by returning `false` (continued from the |
529 | /// example above): |
530 | /// |
531 | /// ``` |
532 | /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
533 | /// # let patterns = &["append" , "appendage" , "app" ]; |
534 | /// # let haystack = "append the app to the appendage" ; |
535 | /// # let ac = AhoCorasickBuilder::new() |
536 | /// # .match_kind(MatchKind::LeftmostFirst) |
537 | /// # .build(patterns); |
538 | /// let mut result = String::new(); |
539 | /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { |
540 | /// dst.push_str(&mat.pattern().to_string()); |
541 | /// mat.pattern() != 2 |
542 | /// }); |
543 | /// assert_eq!("0 the 2 to the appendage" , result); |
544 | /// ``` |
545 | pub fn replace_all_with<F>( |
546 | &self, |
547 | haystack: &str, |
548 | dst: &mut String, |
549 | mut replace_with: F, |
550 | ) where |
551 | F: FnMut(&Match, &str, &mut String) -> bool, |
552 | { |
553 | let mut last_match = 0; |
554 | for mat in self.find_iter(haystack) { |
555 | dst.push_str(&haystack[last_match..mat.start()]); |
556 | last_match = mat.end(); |
557 | if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { |
558 | break; |
559 | }; |
560 | } |
561 | dst.push_str(&haystack[last_match..]); |
562 | } |
563 | |
564 | /// Replace all matches using raw bytes with a closure called on each |
565 | /// match. Matches correspond to the same matches as reported by |
566 | /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). |
567 | /// |
568 | /// The closure accepts three parameters: the match found, the text of |
569 | /// the match and a byte buffer with which to write the replaced text |
570 | /// (if any). If the closure returns `true`, then it continues to the next |
571 | /// match. If the closure returns `false`, then searching is stopped. |
572 | /// |
573 | /// # Examples |
574 | /// |
575 | /// Basic usage: |
576 | /// |
577 | /// ``` |
578 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
579 | /// |
580 | /// let patterns = &["append" , "appendage" , "app" ]; |
581 | /// let haystack = b"append the app to the appendage" ; |
582 | /// |
583 | /// let ac = AhoCorasickBuilder::new() |
584 | /// .match_kind(MatchKind::LeftmostFirst) |
585 | /// .build(patterns); |
586 | /// let mut result = vec![]; |
587 | /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { |
588 | /// dst.extend(mat.pattern().to_string().bytes()); |
589 | /// true |
590 | /// }); |
591 | /// assert_eq!(b"0 the 2 to the 0age" .to_vec(), result); |
592 | /// ``` |
593 | /// |
594 | /// Stopping the replacement by returning `false` (continued from the |
595 | /// example above): |
596 | /// |
597 | /// ``` |
598 | /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
599 | /// # let patterns = &["append" , "appendage" , "app" ]; |
600 | /// # let haystack = b"append the app to the appendage" ; |
601 | /// # let ac = AhoCorasickBuilder::new() |
602 | /// # .match_kind(MatchKind::LeftmostFirst) |
603 | /// # .build(patterns); |
604 | /// let mut result = vec![]; |
605 | /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { |
606 | /// dst.extend(mat.pattern().to_string().bytes()); |
607 | /// mat.pattern() != 2 |
608 | /// }); |
609 | /// assert_eq!(b"0 the 2 to the appendage" .to_vec(), result); |
610 | /// ``` |
611 | pub fn replace_all_with_bytes<F>( |
612 | &self, |
613 | haystack: &[u8], |
614 | dst: &mut Vec<u8>, |
615 | mut replace_with: F, |
616 | ) where |
617 | F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, |
618 | { |
619 | let mut last_match = 0; |
620 | for mat in self.find_iter(haystack) { |
621 | dst.extend(&haystack[last_match..mat.start()]); |
622 | last_match = mat.end(); |
623 | if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { |
624 | break; |
625 | }; |
626 | } |
627 | dst.extend(&haystack[last_match..]); |
628 | } |
629 | |
630 | /// Returns an iterator of non-overlapping matches in the given |
631 | /// stream. Matches correspond to the same matches as reported by |
632 | /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). |
633 | /// |
634 | /// The matches yielded by this iterator use absolute position offsets in |
635 | /// the stream given, where the first byte has index `0`. Matches are |
636 | /// yieled until the stream is exhausted. |
637 | /// |
638 | /// Each item yielded by the iterator is an `io::Result<Match>`, where an |
639 | /// error is yielded if there was a problem reading from the reader given. |
640 | /// |
641 | /// When searching a stream, an internal buffer is used. Therefore, callers |
642 | /// should avoiding providing a buffered reader, if possible. |
643 | /// |
644 | /// Searching a stream requires that the automaton was built with |
645 | /// `MatchKind::Standard` semantics. If this automaton was constructed |
646 | /// with leftmost semantics, then this method will panic. To determine |
647 | /// whether this will panic at runtime, use the |
648 | /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) |
649 | /// method. |
650 | /// |
651 | /// # Memory usage |
652 | /// |
653 | /// In general, searching streams will use a constant amount of memory for |
654 | /// its internal buffer. The one requirement is that the internal buffer |
655 | /// must be at least the size of the longest possible match. In most use |
656 | /// cases, the default buffer size will be much larger than any individual |
657 | /// match. |
658 | /// |
659 | /// # Panics |
660 | /// |
661 | /// This panics when `AhoCorasick::supports_stream` returns `false`. |
662 | /// That is, this panics when this automaton's match semantics are not |
663 | /// `MatchKind::Standard`. This restriction may be lifted in the future. |
664 | /// |
665 | /// # Examples |
666 | /// |
667 | /// Basic usage: |
668 | /// |
669 | /// ``` |
670 | /// use aho_corasick::AhoCorasick; |
671 | /// |
672 | /// # fn example() -> Result<(), ::std::io::Error> { |
673 | /// let patterns = &["append" , "appendage" , "app" ]; |
674 | /// let haystack = "append the app to the appendage" ; |
675 | /// |
676 | /// let ac = AhoCorasick::new(patterns); |
677 | /// let mut matches = vec![]; |
678 | /// for result in ac.stream_find_iter(haystack.as_bytes()) { |
679 | /// let mat = result?; |
680 | /// matches.push(mat.pattern()); |
681 | /// } |
682 | /// assert_eq!(vec![2, 2, 2], matches); |
683 | /// # Ok(()) }; example().unwrap() |
684 | /// ``` |
685 | pub fn stream_find_iter<'a, R: io::Read>( |
686 | &'a self, |
687 | rdr: R, |
688 | ) -> StreamFindIter<'a, R, S> { |
689 | StreamFindIter::new(self, rdr) |
690 | } |
691 | |
692 | /// Search for and replace all matches of this automaton in |
693 | /// the given reader, and write the replacements to the given |
694 | /// writer. Matches correspond to the same matches as reported by |
695 | /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). |
696 | /// |
697 | /// Replacements are determined by the index of the matching pattern. |
698 | /// For example, if the pattern with index `2` is found, then it is |
699 | /// replaced by `replace_with[2]`. |
700 | /// |
701 | /// After all matches are replaced, the writer is _not_ flushed. |
702 | /// |
703 | /// If there was a problem reading from the given reader or writing to the |
704 | /// given writer, then the corresponding `io::Error` is returned and all |
705 | /// replacement is stopped. |
706 | /// |
707 | /// When searching a stream, an internal buffer is used. Therefore, callers |
708 | /// should avoiding providing a buffered reader, if possible. However, |
709 | /// callers may want to provide a buffered writer. |
710 | /// |
711 | /// Searching a stream requires that the automaton was built with |
712 | /// `MatchKind::Standard` semantics. If this automaton was constructed |
713 | /// with leftmost semantics, then this method will panic. To determine |
714 | /// whether this will panic at runtime, use the |
715 | /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) |
716 | /// method. |
717 | /// |
718 | /// # Memory usage |
719 | /// |
720 | /// In general, searching streams will use a constant amount of memory for |
721 | /// its internal buffer. The one requirement is that the internal buffer |
722 | /// must be at least the size of the longest possible match. In most use |
723 | /// cases, the default buffer size will be much larger than any individual |
724 | /// match. |
725 | /// |
726 | /// # Panics |
727 | /// |
728 | /// This panics when `AhoCorasick::supports_stream` returns `false`. |
729 | /// That is, this panics when this automaton's match semantics are not |
730 | /// `MatchKind::Standard`. This restriction may be lifted in the future. |
731 | /// |
732 | /// # Examples |
733 | /// |
734 | /// Basic usage: |
735 | /// |
736 | /// ``` |
737 | /// use aho_corasick::AhoCorasick; |
738 | /// |
739 | /// # fn example() -> Result<(), ::std::io::Error> { |
740 | /// let patterns = &["fox" , "brown" , "quick" ]; |
741 | /// let haystack = "The quick brown fox." ; |
742 | /// let replace_with = &["sloth" , "grey" , "slow" ]; |
743 | /// |
744 | /// let ac = AhoCorasick::new(patterns); |
745 | /// let mut result = vec![]; |
746 | /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?; |
747 | /// assert_eq!(b"The slow grey sloth." .to_vec(), result); |
748 | /// # Ok(()) }; example().unwrap() |
749 | /// ``` |
750 | pub fn stream_replace_all<R, W, B>( |
751 | &self, |
752 | rdr: R, |
753 | wtr: W, |
754 | replace_with: &[B], |
755 | ) -> io::Result<()> |
756 | where |
757 | R: io::Read, |
758 | W: io::Write, |
759 | B: AsRef<[u8]>, |
760 | { |
761 | assert_eq!( |
762 | replace_with.len(), |
763 | self.pattern_count(), |
764 | "stream_replace_all requires a replacement for every pattern \ |
765 | in the automaton" |
766 | ); |
767 | self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| { |
768 | wtr.write_all(replace_with[mat.pattern()].as_ref()) |
769 | }) |
770 | } |
771 | |
772 | /// Search the given reader and replace all matches of this automaton |
773 | /// using the given closure. The result is written to the given |
774 | /// writer. Matches correspond to the same matches as reported by |
775 | /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). |
776 | /// |
777 | /// The closure accepts three parameters: the match found, the text of |
778 | /// the match and the writer with which to write the replaced text (if any). |
779 | /// |
780 | /// After all matches are replaced, the writer is _not_ flushed. |
781 | /// |
782 | /// If there was a problem reading from the given reader or writing to the |
783 | /// given writer, then the corresponding `io::Error` is returned and all |
784 | /// replacement is stopped. |
785 | /// |
786 | /// When searching a stream, an internal buffer is used. Therefore, callers |
787 | /// should avoiding providing a buffered reader, if possible. However, |
788 | /// callers may want to provide a buffered writer. |
789 | /// |
790 | /// Searching a stream requires that the automaton was built with |
791 | /// `MatchKind::Standard` semantics. If this automaton was constructed |
792 | /// with leftmost semantics, then this method will panic. To determine |
793 | /// whether this will panic at runtime, use the |
794 | /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) |
795 | /// method. |
796 | /// |
797 | /// # Memory usage |
798 | /// |
799 | /// In general, searching streams will use a constant amount of memory for |
800 | /// its internal buffer. The one requirement is that the internal buffer |
801 | /// must be at least the size of the longest possible match. In most use |
802 | /// cases, the default buffer size will be much larger than any individual |
803 | /// match. |
804 | /// |
805 | /// # Panics |
806 | /// |
807 | /// This panics when `AhoCorasick::supports_stream` returns `false`. |
808 | /// That is, this panics when this automaton's match semantics are not |
809 | /// `MatchKind::Standard`. This restriction may be lifted in the future. |
810 | /// |
811 | /// # Examples |
812 | /// |
813 | /// Basic usage: |
814 | /// |
815 | /// ``` |
816 | /// use std::io::Write; |
817 | /// use aho_corasick::AhoCorasick; |
818 | /// |
819 | /// # fn example() -> Result<(), ::std::io::Error> { |
820 | /// let patterns = &["fox" , "brown" , "quick" ]; |
821 | /// let haystack = "The quick brown fox." ; |
822 | /// |
823 | /// let ac = AhoCorasick::new(patterns); |
824 | /// let mut result = vec![]; |
825 | /// ac.stream_replace_all_with( |
826 | /// haystack.as_bytes(), |
827 | /// &mut result, |
828 | /// |mat, _, wtr| { |
829 | /// wtr.write_all(mat.pattern().to_string().as_bytes()) |
830 | /// }, |
831 | /// )?; |
832 | /// assert_eq!(b"The 2 1 0." .to_vec(), result); |
833 | /// # Ok(()) }; example().unwrap() |
834 | /// ``` |
835 | pub fn stream_replace_all_with<R, W, F>( |
836 | &self, |
837 | rdr: R, |
838 | mut wtr: W, |
839 | mut replace_with: F, |
840 | ) -> io::Result<()> |
841 | where |
842 | R: io::Read, |
843 | W: io::Write, |
844 | F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>, |
845 | { |
846 | let mut it = StreamChunkIter::new(self, rdr); |
847 | while let Some(result) = it.next() { |
848 | let chunk = result?; |
849 | match chunk { |
850 | StreamChunk::NonMatch { bytes, .. } => { |
851 | wtr.write_all(bytes)?; |
852 | } |
853 | StreamChunk::Match { bytes, mat } => { |
854 | replace_with(&mat, bytes, &mut wtr)?; |
855 | } |
856 | } |
857 | } |
858 | Ok(()) |
859 | } |
860 | |
861 | /// Returns the match kind used by this automaton. |
862 | /// |
863 | /// # Examples |
864 | /// |
865 | /// Basic usage: |
866 | /// |
867 | /// ``` |
868 | /// use aho_corasick::{AhoCorasick, MatchKind}; |
869 | /// |
870 | /// let ac = AhoCorasick::new(&[ |
871 | /// "foo" , "bar" , "quux" , "baz" , |
872 | /// ]); |
873 | /// assert_eq!(&MatchKind::Standard, ac.match_kind()); |
874 | /// ``` |
875 | pub fn match_kind(&self) -> &MatchKind { |
876 | self.imp.match_kind() |
877 | } |
878 | |
879 | /// Returns the length of the longest pattern matched by this automaton. |
880 | /// |
881 | /// # Examples |
882 | /// |
883 | /// Basic usage: |
884 | /// |
885 | /// ``` |
886 | /// use aho_corasick::AhoCorasick; |
887 | /// |
888 | /// let ac = AhoCorasick::new(&[ |
889 | /// "foo" , "bar" , "quux" , "baz" , |
890 | /// ]); |
891 | /// assert_eq!(4, ac.max_pattern_len()); |
892 | /// ``` |
893 | pub fn max_pattern_len(&self) -> usize { |
894 | self.imp.max_pattern_len() |
895 | } |
896 | |
897 | /// Return the total number of patterns matched by this automaton. |
898 | /// |
899 | /// This includes patterns that may never participate in a match. For |
900 | /// example, if |
901 | /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst) |
902 | /// match semantics are used, and the patterns `Sam` and `Samwise` were |
903 | /// used to build the automaton, then `Samwise` can never participate in a |
904 | /// match because `Sam` will always take priority. |
905 | /// |
906 | /// # Examples |
907 | /// |
908 | /// Basic usage: |
909 | /// |
910 | /// ``` |
911 | /// use aho_corasick::AhoCorasick; |
912 | /// |
913 | /// let ac = AhoCorasick::new(&[ |
914 | /// "foo" , "bar" , "baz" , |
915 | /// ]); |
916 | /// assert_eq!(3, ac.pattern_count()); |
917 | /// ``` |
918 | pub fn pattern_count(&self) -> usize { |
919 | self.imp.pattern_count() |
920 | } |
921 | |
922 | /// Returns true if and only if this automaton supports reporting |
923 | /// overlapping matches. |
924 | /// |
925 | /// If this returns false and overlapping matches are requested, then it |
926 | /// will result in a panic. |
927 | /// |
928 | /// Since leftmost matching is inherently incompatible with overlapping |
929 | /// matches, only |
930 | /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) |
931 | /// supports overlapping matches. This is unlikely to change in the future. |
932 | /// |
933 | /// # Examples |
934 | /// |
935 | /// Basic usage: |
936 | /// |
937 | /// ``` |
938 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
939 | /// |
940 | /// let ac = AhoCorasickBuilder::new() |
941 | /// .match_kind(MatchKind::Standard) |
942 | /// .build(&["foo" , "bar" , "baz" ]); |
943 | /// assert!(ac.supports_overlapping()); |
944 | /// |
945 | /// let ac = AhoCorasickBuilder::new() |
946 | /// .match_kind(MatchKind::LeftmostFirst) |
947 | /// .build(&["foo" , "bar" , "baz" ]); |
948 | /// assert!(!ac.supports_overlapping()); |
949 | /// ``` |
950 | pub fn supports_overlapping(&self) -> bool { |
951 | self.match_kind.supports_overlapping() |
952 | } |
953 | |
954 | /// Returns true if and only if this automaton supports stream searching. |
955 | /// |
956 | /// If this returns false and stream searching (or replacing) is attempted, |
957 | /// then it will result in a panic. |
958 | /// |
959 | /// Currently, only |
960 | /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) |
961 | /// supports streaming. This may be expanded in the future. |
962 | /// |
963 | /// # Examples |
964 | /// |
965 | /// Basic usage: |
966 | /// |
967 | /// ``` |
968 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
969 | /// |
970 | /// let ac = AhoCorasickBuilder::new() |
971 | /// .match_kind(MatchKind::Standard) |
972 | /// .build(&["foo" , "bar" , "baz" ]); |
973 | /// assert!(ac.supports_stream()); |
974 | /// |
975 | /// let ac = AhoCorasickBuilder::new() |
976 | /// .match_kind(MatchKind::LeftmostFirst) |
977 | /// .build(&["foo" , "bar" , "baz" ]); |
978 | /// assert!(!ac.supports_stream()); |
979 | /// ``` |
980 | pub fn supports_stream(&self) -> bool { |
981 | self.match_kind.supports_stream() |
982 | } |
983 | |
984 | /// Returns the approximate total amount of heap used by this automaton, in |
985 | /// units of bytes. |
986 | /// |
987 | /// # Examples |
988 | /// |
989 | /// This example shows the difference in heap usage between a few |
990 | /// configurations: |
991 | /// |
992 | /// ```ignore |
993 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
994 | /// |
995 | /// let ac = AhoCorasickBuilder::new() |
996 | /// .dfa(false) // default |
997 | /// .build(&["foo" , "bar" , "baz" ]); |
998 | /// assert_eq!(10_336, ac.heap_bytes()); |
999 | /// |
1000 | /// let ac = AhoCorasickBuilder::new() |
1001 | /// .dfa(false) // default |
1002 | /// .ascii_case_insensitive(true) |
1003 | /// .build(&["foo" , "bar" , "baz" ]); |
1004 | /// assert_eq!(10_384, ac.heap_bytes()); |
1005 | /// |
1006 | /// let ac = AhoCorasickBuilder::new() |
1007 | /// .dfa(true) |
1008 | /// .ascii_case_insensitive(true) |
1009 | /// .build(&["foo" , "bar" , "baz" ]); |
1010 | /// assert_eq!(1_248, ac.heap_bytes()); |
1011 | /// ``` |
1012 | pub fn heap_bytes(&self) -> usize { |
1013 | match self.imp { |
1014 | Imp::NFA(ref nfa) => nfa.heap_bytes(), |
1015 | Imp::DFA(ref dfa) => dfa.heap_bytes(), |
1016 | } |
1017 | } |
1018 | } |
1019 | |
1020 | /// The internal implementation of Aho-Corasick, which is either an NFA or |
1021 | /// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses |
1022 | /// more memory. |
1023 | #[derive (Clone, Debug)] |
1024 | enum Imp<S: StateID> { |
1025 | NFA(NFA<S>), |
1026 | DFA(DFA<S>), |
1027 | } |
1028 | |
1029 | impl<S: StateID> Imp<S> { |
1030 | /// Returns the type of match semantics implemented by this automaton. |
1031 | fn match_kind(&self) -> &MatchKind { |
1032 | match *self { |
1033 | Imp::NFA(ref nfa) => nfa.match_kind(), |
1034 | Imp::DFA(ref dfa) => dfa.match_kind(), |
1035 | } |
1036 | } |
1037 | |
1038 | /// Returns the identifier of the start state. |
1039 | fn start_state(&self) -> S { |
1040 | match *self { |
1041 | Imp::NFA(ref nfa) => nfa.start_state(), |
1042 | Imp::DFA(ref dfa) => dfa.start_state(), |
1043 | } |
1044 | } |
1045 | |
1046 | /// The length, in bytes, of the longest pattern in this automaton. This |
1047 | /// information is useful for maintaining correct buffer sizes when |
1048 | /// searching on streams. |
1049 | fn max_pattern_len(&self) -> usize { |
1050 | match *self { |
1051 | Imp::NFA(ref nfa) => nfa.max_pattern_len(), |
1052 | Imp::DFA(ref dfa) => dfa.max_pattern_len(), |
1053 | } |
1054 | } |
1055 | |
1056 | /// The total number of patterns added to this automaton. This includes |
1057 | /// patterns that may never match. The maximum matching pattern that can be |
1058 | /// reported is exactly one less than this number. |
1059 | fn pattern_count(&self) -> usize { |
1060 | match *self { |
1061 | Imp::NFA(ref nfa) => nfa.pattern_count(), |
1062 | Imp::DFA(ref dfa) => dfa.pattern_count(), |
1063 | } |
1064 | } |
1065 | |
1066 | /// Returns the prefilter object, if one exists, for the underlying |
1067 | /// automaton. |
1068 | fn prefilter(&self) -> Option<&dyn Prefilter> { |
1069 | match *self { |
1070 | Imp::NFA(ref nfa) => nfa.prefilter(), |
1071 | Imp::DFA(ref dfa) => dfa.prefilter(), |
1072 | } |
1073 | } |
1074 | |
1075 | /// Returns true if and only if we should attempt to use a prefilter. |
1076 | fn use_prefilter(&self) -> bool { |
1077 | let p = match self.prefilter() { |
1078 | None => return false, |
1079 | Some(p) => p, |
1080 | }; |
1081 | !p.looks_for_non_start_of_match() |
1082 | } |
1083 | |
1084 | #[inline (always)] |
1085 | fn overlapping_find_at( |
1086 | &self, |
1087 | prestate: &mut PrefilterState, |
1088 | haystack: &[u8], |
1089 | at: usize, |
1090 | state_id: &mut S, |
1091 | match_index: &mut usize, |
1092 | ) -> Option<Match> { |
1093 | match *self { |
1094 | Imp::NFA(ref nfa) => nfa.overlapping_find_at( |
1095 | prestate, |
1096 | haystack, |
1097 | at, |
1098 | state_id, |
1099 | match_index, |
1100 | ), |
1101 | Imp::DFA(ref dfa) => dfa.overlapping_find_at( |
1102 | prestate, |
1103 | haystack, |
1104 | at, |
1105 | state_id, |
1106 | match_index, |
1107 | ), |
1108 | } |
1109 | } |
1110 | |
1111 | #[inline (always)] |
1112 | fn earliest_find_at( |
1113 | &self, |
1114 | prestate: &mut PrefilterState, |
1115 | haystack: &[u8], |
1116 | at: usize, |
1117 | state_id: &mut S, |
1118 | ) -> Option<Match> { |
1119 | match *self { |
1120 | Imp::NFA(ref nfa) => { |
1121 | nfa.earliest_find_at(prestate, haystack, at, state_id) |
1122 | } |
1123 | Imp::DFA(ref dfa) => { |
1124 | dfa.earliest_find_at(prestate, haystack, at, state_id) |
1125 | } |
1126 | } |
1127 | } |
1128 | |
1129 | #[inline (always)] |
1130 | fn find_at_no_state( |
1131 | &self, |
1132 | prestate: &mut PrefilterState, |
1133 | haystack: &[u8], |
1134 | at: usize, |
1135 | ) -> Option<Match> { |
1136 | match *self { |
1137 | Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at), |
1138 | Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at), |
1139 | } |
1140 | } |
1141 | } |
1142 | |
1143 | /// An iterator of non-overlapping matches in a particular haystack. |
1144 | /// |
1145 | /// This iterator yields matches according to the |
1146 | /// [`MatchKind`](enum.MatchKind.html) |
1147 | /// used by this automaton. |
1148 | /// |
1149 | /// This iterator is constructed via the |
1150 | /// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter) |
1151 | /// method. |
1152 | /// |
1153 | /// The type variable `S` refers to the representation used for state |
1154 | /// identifiers. (By default, this is `usize`.) |
1155 | /// |
1156 | /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. |
1157 | /// |
1158 | /// The lifetime `'b` refers to the lifetime of the haystack being searched. |
1159 | #[derive (Debug)] |
1160 | pub struct FindIter<'a, 'b, S: StateID> { |
1161 | fsm: &'a Imp<S>, |
1162 | prestate: PrefilterState, |
1163 | haystack: &'b [u8], |
1164 | pos: usize, |
1165 | } |
1166 | |
1167 | impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> { |
1168 | fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> { |
1169 | let prestate: PrefilterState = PrefilterState::new(max_match_len:ac.max_pattern_len()); |
1170 | FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 } |
1171 | } |
1172 | } |
1173 | |
1174 | impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> { |
1175 | type Item = Match; |
1176 | |
1177 | fn next(&mut self) -> Option<Match> { |
1178 | if self.pos > self.haystack.len() { |
1179 | return None; |
1180 | } |
1181 | let result = self.fsm.find_at_no_state( |
1182 | &mut self.prestate, |
1183 | self.haystack, |
1184 | self.pos, |
1185 | ); |
1186 | let mat = match result { |
1187 | None => return None, |
1188 | Some(mat) => mat, |
1189 | }; |
1190 | if mat.end() == self.pos { |
1191 | // If the automaton can match the empty string and if we found an |
1192 | // empty match, then we need to forcefully move the position. |
1193 | self.pos += 1; |
1194 | } else { |
1195 | self.pos = mat.end(); |
1196 | } |
1197 | Some(mat) |
1198 | } |
1199 | } |
1200 | |
1201 | /// An iterator of overlapping matches in a particular haystack. |
1202 | /// |
1203 | /// This iterator will report all possible matches in a particular haystack, |
1204 | /// even when the matches overlap. |
1205 | /// |
1206 | /// This iterator is constructed via the |
1207 | /// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter) |
1208 | /// method. |
1209 | /// |
1210 | /// The type variable `S` refers to the representation used for state |
1211 | /// identifiers. (By default, this is `usize`.) |
1212 | /// |
1213 | /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. |
1214 | /// |
1215 | /// The lifetime `'b` refers to the lifetime of the haystack being searched. |
1216 | #[derive (Debug)] |
1217 | pub struct FindOverlappingIter<'a, 'b, S: StateID> { |
1218 | fsm: &'a Imp<S>, |
1219 | prestate: PrefilterState, |
1220 | haystack: &'b [u8], |
1221 | pos: usize, |
1222 | state_id: S, |
1223 | match_index: usize, |
1224 | } |
1225 | |
1226 | impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> { |
1227 | fn new( |
1228 | ac: &'a AhoCorasick<S>, |
1229 | haystack: &'b [u8], |
1230 | ) -> FindOverlappingIter<'a, 'b, S> { |
1231 | assert!( |
1232 | ac.supports_overlapping(), |
1233 | "automaton does not support overlapping searches" |
1234 | ); |
1235 | let prestate: PrefilterState = PrefilterState::new(max_match_len:ac.max_pattern_len()); |
1236 | FindOverlappingIter { |
1237 | fsm: &ac.imp, |
1238 | prestate, |
1239 | haystack, |
1240 | pos: 0, |
1241 | state_id: ac.imp.start_state(), |
1242 | match_index: 0, |
1243 | } |
1244 | } |
1245 | } |
1246 | |
1247 | impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> { |
1248 | type Item = Match; |
1249 | |
1250 | fn next(&mut self) -> Option<Match> { |
1251 | let result: Option = self.fsm.overlapping_find_at( |
1252 | &mut self.prestate, |
1253 | self.haystack, |
1254 | self.pos, |
1255 | &mut self.state_id, |
1256 | &mut self.match_index, |
1257 | ); |
1258 | match result { |
1259 | None => return None, |
1260 | Some(m: Match) => { |
1261 | self.pos = m.end(); |
1262 | Some(m) |
1263 | } |
1264 | } |
1265 | } |
1266 | } |
1267 | |
1268 | /// An iterator that reports Aho-Corasick matches in a stream. |
1269 | /// |
1270 | /// This iterator yields elements of type `io::Result<Match>`, where an error |
1271 | /// is reported if there was a problem reading from the underlying stream. |
1272 | /// The iterator terminates only when the underlying stream reaches `EOF`. |
1273 | /// |
1274 | /// This iterator is constructed via the |
1275 | /// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter) |
1276 | /// method. |
1277 | /// |
1278 | /// The type variable `R` refers to the `io::Read` stream that is being read |
1279 | /// from. |
1280 | /// |
1281 | /// The type variable `S` refers to the representation used for state |
1282 | /// identifiers. (By default, this is `usize`.) |
1283 | /// |
1284 | /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. |
1285 | #[derive (Debug)] |
1286 | pub struct StreamFindIter<'a, R, S: StateID> { |
1287 | it: StreamChunkIter<'a, R, S>, |
1288 | } |
1289 | |
1290 | impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> { |
1291 | fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> { |
1292 | StreamFindIter { it: StreamChunkIter::new(ac, rdr) } |
1293 | } |
1294 | } |
1295 | |
1296 | impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> { |
1297 | type Item = io::Result<Match>; |
1298 | |
1299 | fn next(&mut self) -> Option<io::Result<Match>> { |
1300 | loop { |
1301 | match self.it.next() { |
1302 | None => return None, |
1303 | Some(Err(err: Error)) => return Some(Err(err)), |
1304 | Some(Ok(StreamChunk::NonMatch { .. })) => {} |
1305 | Some(Ok(StreamChunk::Match { mat: Match, .. })) => { |
1306 | return Some(Ok(mat)); |
1307 | } |
1308 | } |
1309 | } |
1310 | } |
1311 | } |
1312 | |
1313 | /// An iterator over chunks in an underlying reader. Each chunk either |
1314 | /// corresponds to non-matching bytes or matching bytes, but all bytes from |
1315 | /// the underlying reader are reported in sequence. There may be an arbitrary |
1316 | /// number of non-matching chunks before seeing a matching chunk. |
1317 | /// |
1318 | /// N.B. This does not actually implement Iterator because we need to borrow |
1319 | /// from the underlying reader. But conceptually, it's still an iterator. |
1320 | #[derive (Debug)] |
1321 | struct StreamChunkIter<'a, R, S: StateID> { |
1322 | /// The AC automaton. |
1323 | fsm: &'a Imp<S>, |
1324 | /// State associated with this automaton's prefilter. It is a heuristic |
1325 | /// for stopping the prefilter if it's deemed ineffective. |
1326 | prestate: PrefilterState, |
1327 | /// The source of bytes we read from. |
1328 | rdr: R, |
1329 | /// A fixed size buffer. This is what we actually search. There are some |
1330 | /// invariants around the buffer's size, namely, it must be big enough to |
1331 | /// contain the longest possible match. |
1332 | buf: Buffer, |
1333 | /// The ID of the FSM state we're currently in. |
1334 | state_id: S, |
1335 | /// The current position at which to start the next search in `buf`. |
1336 | search_pos: usize, |
1337 | /// The absolute position of `search_pos`, where `0` corresponds to the |
1338 | /// position of the first byte read from `rdr`. |
1339 | absolute_pos: usize, |
1340 | /// The ending position of the last StreamChunk that was returned to the |
1341 | /// caller. This position is used to determine whether we need to emit |
1342 | /// non-matching bytes before emitting a match. |
1343 | report_pos: usize, |
1344 | /// A match that should be reported on the next call. |
1345 | pending_match: Option<Match>, |
1346 | /// Enabled only when the automaton can match the empty string. When |
1347 | /// enabled, we need to execute one final search after consuming the |
1348 | /// reader to find the trailing empty match. |
1349 | has_empty_match_at_end: bool, |
1350 | } |
1351 | |
1352 | /// A single chunk yielded by the stream chunk iterator. |
1353 | /// |
1354 | /// The `'r` lifetime refers to the lifetime of the stream chunk iterator. |
1355 | #[derive (Debug)] |
1356 | enum StreamChunk<'r> { |
1357 | /// A chunk that does not contain any matches. |
1358 | NonMatch { bytes: &'r [u8] }, |
1359 | /// A chunk that precisely contains a match. |
1360 | Match { bytes: &'r [u8], mat: Match }, |
1361 | } |
1362 | |
1363 | impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> { |
1364 | fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> { |
1365 | assert!( |
1366 | ac.supports_stream(), |
1367 | "stream searching is only supported for Standard match semantics" |
1368 | ); |
1369 | |
1370 | let prestate = if ac.imp.use_prefilter() { |
1371 | PrefilterState::new(ac.max_pattern_len()) |
1372 | } else { |
1373 | PrefilterState::disabled() |
1374 | }; |
1375 | let buf = Buffer::new(ac.imp.max_pattern_len()); |
1376 | let state_id = ac.imp.start_state(); |
1377 | StreamChunkIter { |
1378 | fsm: &ac.imp, |
1379 | prestate, |
1380 | rdr, |
1381 | buf, |
1382 | state_id, |
1383 | absolute_pos: 0, |
1384 | report_pos: 0, |
1385 | search_pos: 0, |
1386 | pending_match: None, |
1387 | has_empty_match_at_end: ac.is_match("" ), |
1388 | } |
1389 | } |
1390 | |
1391 | fn next(&mut self) -> Option<io::Result<StreamChunk>> { |
1392 | loop { |
1393 | if let Some(mut mat) = self.pending_match.take() { |
1394 | let bytes = &self.buf.buffer()[mat.start()..mat.end()]; |
1395 | self.report_pos = mat.end(); |
1396 | mat = mat.increment(self.absolute_pos); |
1397 | return Some(Ok(StreamChunk::Match { bytes, mat })); |
1398 | } |
1399 | if self.search_pos >= self.buf.len() { |
1400 | if let Some(end) = self.unreported() { |
1401 | let bytes = &self.buf.buffer()[self.report_pos..end]; |
1402 | self.report_pos = end; |
1403 | return Some(Ok(StreamChunk::NonMatch { bytes })); |
1404 | } |
1405 | if self.buf.len() >= self.buf.min_buffer_len() { |
1406 | // This is the point at which we roll our buffer, which we |
1407 | // only do if our buffer has at least the minimum amount of |
1408 | // bytes in it. Before rolling, we update our various |
1409 | // positions to be consistent with the buffer after it has |
1410 | // been rolled. |
1411 | |
1412 | self.report_pos -= |
1413 | self.buf.len() - self.buf.min_buffer_len(); |
1414 | self.absolute_pos += |
1415 | self.search_pos - self.buf.min_buffer_len(); |
1416 | self.search_pos = self.buf.min_buffer_len(); |
1417 | self.buf.roll(); |
1418 | } |
1419 | match self.buf.fill(&mut self.rdr) { |
1420 | Err(err) => return Some(Err(err)), |
1421 | Ok(false) => { |
1422 | // We've hit EOF, but if there are still some |
1423 | // unreported bytes remaining, return them now. |
1424 | if self.report_pos < self.buf.len() { |
1425 | let bytes = &self.buf.buffer()[self.report_pos..]; |
1426 | self.report_pos = self.buf.len(); |
1427 | |
1428 | let chunk = StreamChunk::NonMatch { bytes }; |
1429 | return Some(Ok(chunk)); |
1430 | } else { |
1431 | // We've reported everything, but there might still |
1432 | // be a match at the very last position. |
1433 | if !self.has_empty_match_at_end { |
1434 | return None; |
1435 | } |
1436 | // fallthrough for another search to get trailing |
1437 | // empty matches |
1438 | self.has_empty_match_at_end = false; |
1439 | } |
1440 | } |
1441 | Ok(true) => {} |
1442 | } |
1443 | } |
1444 | let result = self.fsm.earliest_find_at( |
1445 | &mut self.prestate, |
1446 | self.buf.buffer(), |
1447 | self.search_pos, |
1448 | &mut self.state_id, |
1449 | ); |
1450 | match result { |
1451 | None => { |
1452 | self.search_pos = self.buf.len(); |
1453 | } |
1454 | Some(mat) => { |
1455 | self.state_id = self.fsm.start_state(); |
1456 | if mat.end() == self.search_pos { |
1457 | // If the automaton can match the empty string and if |
1458 | // we found an empty match, then we need to forcefully |
1459 | // move the position. |
1460 | self.search_pos += 1; |
1461 | } else { |
1462 | self.search_pos = mat.end(); |
1463 | } |
1464 | self.pending_match = Some(mat.clone()); |
1465 | if self.report_pos < mat.start() { |
1466 | let bytes = |
1467 | &self.buf.buffer()[self.report_pos..mat.start()]; |
1468 | self.report_pos = mat.start(); |
1469 | |
1470 | let chunk = StreamChunk::NonMatch { bytes }; |
1471 | return Some(Ok(chunk)); |
1472 | } |
1473 | } |
1474 | } |
1475 | } |
1476 | } |
1477 | |
1478 | fn unreported(&self) -> Option<usize> { |
1479 | let end = self.search_pos.saturating_sub(self.buf.min_buffer_len()); |
1480 | if self.report_pos < end { |
1481 | Some(end) |
1482 | } else { |
1483 | None |
1484 | } |
1485 | } |
1486 | } |
1487 | |
1488 | /// A builder for configuring an Aho-Corasick automaton. |
1489 | #[derive (Clone, Debug)] |
1490 | pub struct AhoCorasickBuilder { |
1491 | nfa_builder: nfa::Builder, |
1492 | dfa_builder: dfa::Builder, |
1493 | dfa: bool, |
1494 | } |
1495 | |
1496 | impl Default for AhoCorasickBuilder { |
1497 | fn default() -> AhoCorasickBuilder { |
1498 | AhoCorasickBuilder::new() |
1499 | } |
1500 | } |
1501 | |
1502 | impl AhoCorasickBuilder { |
1503 | /// Create a new builder for configuring an Aho-Corasick automaton. |
1504 | /// |
1505 | /// If you don't need fine grained configuration or aren't sure which knobs |
1506 | /// to set, try using |
1507 | /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) |
1508 | /// instead. |
1509 | pub fn new() -> AhoCorasickBuilder { |
1510 | AhoCorasickBuilder { |
1511 | nfa_builder: nfa::Builder::new(), |
1512 | dfa_builder: dfa::Builder::new(), |
1513 | dfa: false, |
1514 | } |
1515 | } |
1516 | |
1517 | /// Build an Aho-Corasick automaton using the configuration set on this |
1518 | /// builder. |
1519 | /// |
1520 | /// A builder may be reused to create more automatons. |
1521 | /// |
1522 | /// This method will use the default for representing internal state |
1523 | /// identifiers, which is `usize`. This guarantees that building the |
1524 | /// automaton will succeed and is generally a good default, but can make |
1525 | /// the size of the automaton 2-8 times bigger than it needs to be, |
1526 | /// depending on your target platform. |
1527 | /// |
1528 | /// # Examples |
1529 | /// |
1530 | /// Basic usage: |
1531 | /// |
1532 | /// ``` |
1533 | /// use aho_corasick::AhoCorasickBuilder; |
1534 | /// |
1535 | /// let patterns = &["foo" , "bar" , "baz" ]; |
1536 | /// let ac = AhoCorasickBuilder::new() |
1537 | /// .build(patterns); |
1538 | /// assert_eq!(Some(1), ac.find("xxx bar xxx" ).map(|m| m.pattern())); |
1539 | /// ``` |
1540 | pub fn build<I, P>(&self, patterns: I) -> AhoCorasick |
1541 | where |
1542 | I: IntoIterator<Item = P>, |
1543 | P: AsRef<[u8]>, |
1544 | { |
1545 | // The builder only returns an error if the chosen state ID |
1546 | // representation is too small to fit all of the given patterns. In |
1547 | // this case, since we fix the representation to usize, it will always |
1548 | // work because it's impossible to overflow usize since the underlying |
1549 | // storage would OOM long before that happens. |
1550 | self.build_with_size::<usize, I, P>(patterns) |
1551 | .expect("usize state ID type should always work" ) |
1552 | } |
1553 | |
1554 | /// Build an Aho-Corasick automaton using the configuration set on this |
1555 | /// builder with a specific state identifier representation. This only has |
1556 | /// an effect when the `dfa` option is enabled. |
1557 | /// |
1558 | /// Generally, the choices for a state identifier representation are |
1559 | /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default. |
1560 | /// The advantage of choosing a smaller state identifier representation |
1561 | /// is that the automaton produced will be smaller. This might be |
1562 | /// beneficial for just generally using less space, or might even allow it |
1563 | /// to fit more of the automaton in your CPU's cache, leading to overall |
1564 | /// better search performance. |
1565 | /// |
1566 | /// Unlike the standard `build` method, this can report an error if the |
1567 | /// state identifier representation cannot support the size of the |
1568 | /// automaton. |
1569 | /// |
1570 | /// Note that the state identifier representation is determined by the |
1571 | /// `S` type variable. This requires a type hint of some sort, either |
1572 | /// by specifying the return type or using the turbofish, e.g., |
1573 | /// `build_with_size::<u16, _, _>(...)`. |
1574 | /// |
1575 | /// # Examples |
1576 | /// |
1577 | /// Basic usage: |
1578 | /// |
1579 | /// ``` |
1580 | /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder}; |
1581 | /// |
1582 | /// # fn example() -> Result<(), ::aho_corasick::Error> { |
1583 | /// let patterns = &["foo" , "bar" , "baz" ]; |
1584 | /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new() |
1585 | /// .build_with_size(patterns)?; |
1586 | /// assert_eq!(Some(1), ac.find("xxx bar xxx" ).map(|m| m.pattern())); |
1587 | /// # Ok(()) }; example().unwrap() |
1588 | /// ``` |
1589 | /// |
1590 | /// Or alternatively, with turbofish: |
1591 | /// |
1592 | /// ``` |
1593 | /// use aho_corasick::AhoCorasickBuilder; |
1594 | /// |
1595 | /// # fn example() -> Result<(), ::aho_corasick::Error> { |
1596 | /// let patterns = &["foo" , "bar" , "baz" ]; |
1597 | /// let ac = AhoCorasickBuilder::new() |
1598 | /// .build_with_size::<u8, _, _>(patterns)?; |
1599 | /// assert_eq!(Some(1), ac.find("xxx bar xxx" ).map(|m| m.pattern())); |
1600 | /// # Ok(()) }; example().unwrap() |
1601 | /// ``` |
1602 | pub fn build_with_size<S, I, P>( |
1603 | &self, |
1604 | patterns: I, |
1605 | ) -> Result<AhoCorasick<S>> |
1606 | where |
1607 | S: StateID, |
1608 | I: IntoIterator<Item = P>, |
1609 | P: AsRef<[u8]>, |
1610 | { |
1611 | let nfa = self.nfa_builder.build(patterns)?; |
1612 | let match_kind = nfa.match_kind().clone(); |
1613 | let imp = if self.dfa { |
1614 | let dfa = self.dfa_builder.build(&nfa)?; |
1615 | Imp::DFA(dfa) |
1616 | } else { |
1617 | Imp::NFA(nfa) |
1618 | }; |
1619 | Ok(AhoCorasick { imp, match_kind }) |
1620 | } |
1621 | |
1622 | /// Automatically configure the settings on this builder according to the |
1623 | /// patterns that will be used to construct the automaton. |
1624 | /// |
1625 | /// The idea here is to balance space and time automatically. That is, when |
1626 | /// searching a small number of patterns, this will attempt to use the |
1627 | /// fastest possible configuration since the total space required will be |
1628 | /// small anyway. As the number of patterns grows, this will fall back to |
1629 | /// slower configurations that use less space. |
1630 | /// |
1631 | /// This is guaranteed to never set `match_kind`, but any other option may |
1632 | /// be overridden. |
1633 | /// |
1634 | /// # Examples |
1635 | /// |
1636 | /// Basic usage: |
1637 | /// |
1638 | /// ``` |
1639 | /// use aho_corasick::AhoCorasickBuilder; |
1640 | /// |
1641 | /// let patterns = &["foo" , "bar" , "baz" ]; |
1642 | /// let ac = AhoCorasickBuilder::new() |
1643 | /// .auto_configure(patterns) |
1644 | /// .build(patterns); |
1645 | /// assert_eq!(Some(1), ac.find("xxx bar xxx" ).map(|m| m.pattern())); |
1646 | /// ``` |
1647 | pub fn auto_configure<B: AsRef<[u8]>>( |
1648 | &mut self, |
1649 | patterns: &[B], |
1650 | ) -> &mut AhoCorasickBuilder { |
1651 | // N.B. Currently we only use the length of `patterns` to make a |
1652 | // decision here, and could therefore ask for an `ExactSizeIterator` |
1653 | // instead. But it's conceivable that we might adapt this to look at |
1654 | // the total number of bytes, which would requires a second pass. |
1655 | // |
1656 | // The logic here is fairly rudimentary at the moment, but probably |
1657 | // OK. The idea here is to use the fastest thing possible for a small |
1658 | // number of patterns. That is, a DFA with no byte classes, since byte |
1659 | // classes require an extra indirection for every byte searched. With a |
1660 | // moderate number of patterns, we still want a DFA, but save on both |
1661 | // space and compilation time by enabling byte classes. Finally, fall |
1662 | // back to the slower but smaller NFA. |
1663 | if patterns.len() <= 100 { |
1664 | // N.B. Using byte classes can actually be faster by improving |
1665 | // locality, but this only really applies for multi-megabyte |
1666 | // automata (i.e., automata that don't fit in your CPU's cache). |
1667 | self.dfa(true); |
1668 | } else if patterns.len() <= 5000 { |
1669 | self.dfa(true); |
1670 | } |
1671 | self |
1672 | } |
1673 | |
1674 | /// Set the desired match semantics. |
1675 | /// |
1676 | /// The default is `MatchKind::Standard`, which corresponds to the match |
1677 | /// semantics supported by the standard textbook description of the |
1678 | /// Aho-Corasick algorithm. Namely, matches are reported as soon as they |
1679 | /// are found. Moreover, this is the only way to get overlapping matches |
1680 | /// or do stream searching. |
1681 | /// |
1682 | /// The other kinds of match semantics that are supported are |
1683 | /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former |
1684 | /// corresponds to the match you would get if you were to try to match |
1685 | /// each pattern at each position in the haystack in the same order that |
1686 | /// you give to the automaton. That is, it returns the leftmost match |
1687 | /// corresponding the earliest pattern given to the automaton. The latter |
1688 | /// corresponds to finding the longest possible match among all leftmost |
1689 | /// matches. |
1690 | /// |
1691 | /// For more details on match semantics, see the |
1692 | /// [documentation for `MatchKind`](enum.MatchKind.html). |
1693 | /// |
1694 | /// # Examples |
1695 | /// |
1696 | /// In these examples, we demonstrate the differences between match |
1697 | /// semantics for a particular set of patterns in a specific order: |
1698 | /// `b`, `abc`, `abcd`. |
1699 | /// |
1700 | /// Standard semantics: |
1701 | /// |
1702 | /// ``` |
1703 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
1704 | /// |
1705 | /// let patterns = &["b" , "abc" , "abcd" ]; |
1706 | /// let haystack = "abcd" ; |
1707 | /// |
1708 | /// let ac = AhoCorasickBuilder::new() |
1709 | /// .match_kind(MatchKind::Standard) // default, not necessary |
1710 | /// .build(patterns); |
1711 | /// let mat = ac.find(haystack).expect("should have a match" ); |
1712 | /// assert_eq!("b" , &haystack[mat.start()..mat.end()]); |
1713 | /// ``` |
1714 | /// |
1715 | /// Leftmost-first semantics: |
1716 | /// |
1717 | /// ``` |
1718 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
1719 | /// |
1720 | /// let patterns = &["b" , "abc" , "abcd" ]; |
1721 | /// let haystack = "abcd" ; |
1722 | /// |
1723 | /// let ac = AhoCorasickBuilder::new() |
1724 | /// .match_kind(MatchKind::LeftmostFirst) |
1725 | /// .build(patterns); |
1726 | /// let mat = ac.find(haystack).expect("should have a match" ); |
1727 | /// assert_eq!("abc" , &haystack[mat.start()..mat.end()]); |
1728 | /// ``` |
1729 | /// |
1730 | /// Leftmost-longest semantics: |
1731 | /// |
1732 | /// ``` |
1733 | /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; |
1734 | /// |
1735 | /// let patterns = &["b" , "abc" , "abcd" ]; |
1736 | /// let haystack = "abcd" ; |
1737 | /// |
1738 | /// let ac = AhoCorasickBuilder::new() |
1739 | /// .match_kind(MatchKind::LeftmostLongest) |
1740 | /// .build(patterns); |
1741 | /// let mat = ac.find(haystack).expect("should have a match" ); |
1742 | /// assert_eq!("abcd" , &haystack[mat.start()..mat.end()]); |
1743 | /// ``` |
1744 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { |
1745 | self.nfa_builder.match_kind(kind); |
1746 | self |
1747 | } |
1748 | |
1749 | /// Enable anchored mode, which requires all matches to start at the |
1750 | /// first position in a haystack. |
1751 | /// |
1752 | /// This option is disabled by default. |
1753 | /// |
1754 | /// # Examples |
1755 | /// |
1756 | /// Basic usage: |
1757 | /// |
1758 | /// ``` |
1759 | /// use aho_corasick::AhoCorasickBuilder; |
1760 | /// |
1761 | /// let patterns = &["foo" , "bar" ]; |
1762 | /// let haystack = "foobar" ; |
1763 | /// |
1764 | /// let ac = AhoCorasickBuilder::new() |
1765 | /// .anchored(true) |
1766 | /// .build(patterns); |
1767 | /// assert_eq!(1, ac.find_iter(haystack).count()); |
1768 | /// ``` |
1769 | /// |
1770 | /// When searching for overlapping matches, all matches that start at |
1771 | /// the beginning of a haystack will be reported: |
1772 | /// |
1773 | /// ``` |
1774 | /// use aho_corasick::AhoCorasickBuilder; |
1775 | /// |
1776 | /// let patterns = &["foo" , "foofoo" ]; |
1777 | /// let haystack = "foofoo" ; |
1778 | /// |
1779 | /// let ac = AhoCorasickBuilder::new() |
1780 | /// .anchored(true) |
1781 | /// .build(patterns); |
1782 | /// assert_eq!(2, ac.find_overlapping_iter(haystack).count()); |
1783 | /// // A non-anchored search would return 3 matches. |
1784 | /// ``` |
1785 | pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder { |
1786 | self.nfa_builder.anchored(yes); |
1787 | self |
1788 | } |
1789 | |
1790 | /// Enable ASCII-aware case insensitive matching. |
1791 | /// |
1792 | /// When this option is enabled, searching will be performed without |
1793 | /// respect to case for ASCII letters (`a-z` and `A-Z`) only. |
1794 | /// |
1795 | /// Enabling this option does not change the search algorithm, but it may |
1796 | /// increase the size of the automaton. |
1797 | /// |
1798 | /// **NOTE:** It is unlikely that support for Unicode case folding will |
1799 | /// be added in the future. The ASCII case works via a simple hack to the |
1800 | /// underlying automaton, but full Unicode handling requires a fair bit of |
1801 | /// sophistication. If you do need Unicode handling, you might consider |
1802 | /// using the [`regex` crate](https://docs.rs/regex) or the lower level |
1803 | /// [`regex-automata` crate](https://docs.rs/regex-automata). |
1804 | /// |
1805 | /// # Examples |
1806 | /// |
1807 | /// Basic usage: |
1808 | /// |
1809 | /// ``` |
1810 | /// use aho_corasick::AhoCorasickBuilder; |
1811 | /// |
1812 | /// let patterns = &["FOO" , "bAr" , "BaZ" ]; |
1813 | /// let haystack = "foo bar baz" ; |
1814 | /// |
1815 | /// let ac = AhoCorasickBuilder::new() |
1816 | /// .ascii_case_insensitive(true) |
1817 | /// .build(patterns); |
1818 | /// assert_eq!(3, ac.find_iter(haystack).count()); |
1819 | /// ``` |
1820 | pub fn ascii_case_insensitive( |
1821 | &mut self, |
1822 | yes: bool, |
1823 | ) -> &mut AhoCorasickBuilder { |
1824 | self.nfa_builder.ascii_case_insensitive(yes); |
1825 | self |
1826 | } |
1827 | |
1828 | /// Set the limit on how many NFA states use a dense representation for |
1829 | /// their transitions. |
1830 | /// |
1831 | /// A dense representation uses more space, but supports faster access to |
1832 | /// transitions at search time. Thus, this setting permits the control of a |
1833 | /// space vs time trade off when using the NFA variant of Aho-Corasick. |
1834 | /// |
1835 | /// This limit is expressed in terms of the depth of a state, i.e., the |
1836 | /// number of transitions from the starting state of the NFA. The idea is |
1837 | /// that most of the time searching will be spent near the starting state |
1838 | /// of the automaton, so states near the start state should use a dense |
1839 | /// representation. States further away from the start state would then use |
1840 | /// a sparse representation, which uses less space but is slower to access |
1841 | /// transitions at search time. |
1842 | /// |
1843 | /// By default, this is set to a low but non-zero number. |
1844 | /// |
1845 | /// This setting has no effect if the `dfa` option is enabled. |
1846 | pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder { |
1847 | self.nfa_builder.dense_depth(depth); |
1848 | self |
1849 | } |
1850 | |
1851 | /// Compile the standard Aho-Corasick automaton into a deterministic finite |
1852 | /// automaton (DFA). |
1853 | /// |
1854 | /// When this is disabled (which is the default), then a non-deterministic |
1855 | /// finite automaton (NFA) is used instead. |
1856 | /// |
1857 | /// The main benefit to a DFA is that it can execute searches more quickly |
1858 | /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the |
1859 | /// DFA uses more space and can take much longer to build. |
1860 | /// |
1861 | /// Enabling this option does not change the time complexity for |
1862 | /// constructing the Aho-Corasick automaton (which is `O(p)` where |
1863 | /// `p` is the total number of patterns being compiled). Enabling this |
1864 | /// option does however reduce the time complexity of non-overlapping |
1865 | /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the |
1866 | /// haystack. |
1867 | /// |
1868 | /// In general, it's a good idea to enable this if you're searching a |
1869 | /// small number of fairly short patterns (~1000), or if you want the |
1870 | /// fastest possible search without regard to compilation time or space |
1871 | /// usage. |
1872 | pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder { |
1873 | self.dfa = yes; |
1874 | self |
1875 | } |
1876 | |
1877 | /// Enable heuristic prefilter optimizations. |
1878 | /// |
1879 | /// When enabled, searching will attempt to quickly skip to match |
1880 | /// candidates using specialized literal search routines. A prefilter |
1881 | /// cannot always be used, and is generally treated as a heuristic. It |
1882 | /// can be useful to disable this if the prefilter is observed to be |
1883 | /// sub-optimal for a particular workload. |
1884 | /// |
1885 | /// This is enabled by default. |
1886 | pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder { |
1887 | self.nfa_builder.prefilter(yes); |
1888 | self |
1889 | } |
1890 | |
1891 | /// Shrink the size of the transition alphabet by mapping bytes to their |
1892 | /// equivalence classes. This only has an effect when the `dfa` option is |
1893 | /// enabled. |
1894 | /// |
1895 | /// When enabled, each a DFA will use a map from all possible bytes |
1896 | /// to their corresponding equivalence class. Each equivalence class |
1897 | /// represents a set of bytes that does not discriminate between a match |
1898 | /// and a non-match in the DFA. For example, the patterns `bar` and `baz` |
1899 | /// have at least five equivalence classes: singleton sets of `b`, `a`, `r` |
1900 | /// and `z`, and a final set that contains every other byte. |
1901 | /// |
1902 | /// The advantage of this map is that the size of the transition table can |
1903 | /// be reduced drastically from `#states * 256 * sizeof(id)` to |
1904 | /// `#states * k * sizeof(id)` where `k` is the number of equivalence |
1905 | /// classes. As a result, total space usage can decrease substantially. |
1906 | /// Moreover, since a smaller alphabet is used, compilation becomes faster |
1907 | /// as well. |
1908 | /// |
1909 | /// The disadvantage of this map is that every byte searched must be |
1910 | /// passed through this map before it can be used to determine the next |
1911 | /// transition. This has a small match time performance cost. However, if |
1912 | /// the DFA is otherwise very large without byte classes, then using byte |
1913 | /// classes can greatly improve memory locality and thus lead to better |
1914 | /// overall performance. |
1915 | /// |
1916 | /// This option is enabled by default. |
1917 | #[deprecated ( |
1918 | since = "0.7.16" , |
1919 | note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" |
1920 | )] |
1921 | pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder { |
1922 | self.dfa_builder.byte_classes(yes); |
1923 | self |
1924 | } |
1925 | |
1926 | /// Premultiply state identifiers in the transition table. This only has |
1927 | /// an effect when the `dfa` option is enabled. |
1928 | /// |
1929 | /// When enabled, state identifiers are premultiplied to point to their |
1930 | /// corresponding row in the transition table. That is, given the `i`th |
1931 | /// state, its corresponding premultiplied identifier is `i * k` where `k` |
1932 | /// is the alphabet size of the automaton. (The alphabet size is at most |
1933 | /// 256, but is in practice smaller if byte classes is enabled.) |
1934 | /// |
1935 | /// When state identifiers are not premultiplied, then the identifier of |
1936 | /// the `i`th state is `i`. |
1937 | /// |
1938 | /// The advantage of premultiplying state identifiers is that is saves a |
1939 | /// multiplication instruction per byte when searching with a DFA. This has |
1940 | /// been observed to lead to a 20% performance benefit in micro-benchmarks. |
1941 | /// |
1942 | /// The primary disadvantage of premultiplying state identifiers is |
1943 | /// that they require a larger integer size to represent. For example, |
1944 | /// if the DFA has 200 states, then its premultiplied form requires 16 |
1945 | /// bits to represent every possible state identifier, where as its |
1946 | /// non-premultiplied form only requires 8 bits. |
1947 | /// |
1948 | /// This option is enabled by default. |
1949 | #[deprecated ( |
1950 | since = "0.7.16" , |
1951 | note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" |
1952 | )] |
1953 | pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder { |
1954 | self.dfa_builder.premultiply(yes); |
1955 | self |
1956 | } |
1957 | } |
1958 | |
1959 | /// A knob for controlling the match semantics of an Aho-Corasick automaton. |
1960 | /// |
1961 | /// There are two generally different ways that Aho-Corasick automatons can |
1962 | /// report matches. The first way is the "standard" approach that results from |
1963 | /// implementing most textbook explanations of Aho-Corasick. The second way is |
1964 | /// to report only the leftmost non-overlapping matches. The leftmost approach |
1965 | /// is in turn split into two different ways of resolving ambiguous matches: |
1966 | /// leftmost-first and leftmost-longest. |
1967 | /// |
1968 | /// The `Standard` match kind is the default and is the only one that supports |
1969 | /// overlapping matches and stream searching. (Trying to find overlapping |
1970 | /// or streaming matches using leftmost match semantics will result in a |
1971 | /// panic.) The `Standard` match kind will report matches as they are seen. |
1972 | /// When searching for overlapping matches, then all possible matches are |
1973 | /// reported. When searching for non-overlapping matches, the first match seen |
1974 | /// is reported. For example, for non-overlapping matches, given the patterns |
1975 | /// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is |
1976 | /// reported since it is detected first. The `abcd` match is never reported |
1977 | /// since it overlaps with the `b` match. |
1978 | /// |
1979 | /// In contrast, the leftmost match kind always prefers the leftmost match |
1980 | /// among all possible matches. Given the same example as above with `abcd` and |
1981 | /// `b` as patterns and `abcdef` as the subject string, the leftmost match is |
1982 | /// `abcd` since it begins before the `b` match, even though the `b` match is |
1983 | /// detected before the `abcd` match. In this case, the `b` match is not |
1984 | /// reported at all since it overlaps with the `abcd` match. |
1985 | /// |
1986 | /// The difference between leftmost-first and leftmost-longest is in how they |
1987 | /// resolve ambiguous matches when there are multiple leftmost matches to |
1988 | /// choose from. Leftmost-first always chooses the pattern that was provided |
1989 | /// earliest, where as leftmost-longest always chooses the longest matching |
1990 | /// pattern. For example, given the patterns `a` and `ab` and the subject |
1991 | /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match |
1992 | /// is `ab`. Conversely, if the patterns were given in reverse order, i.e., |
1993 | /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches |
1994 | /// would be `ab`. Stated differently, the leftmost-first match depends on the |
1995 | /// order in which the patterns were given to the Aho-Corasick automaton. |
1996 | /// Because of that, when leftmost-first matching is used, if a pattern `A` |
1997 | /// that appears before a pattern `B` is a prefix of `B`, then it is impossible |
1998 | /// to ever observe a match of `B`. |
1999 | /// |
2000 | /// If you're not sure which match kind to pick, then stick with the standard |
2001 | /// kind, which is the default. In particular, if you need overlapping or |
2002 | /// streaming matches, then you _must_ use the standard kind. The leftmost |
2003 | /// kinds are useful in specific circumstances. For example, leftmost-first can |
2004 | /// be very useful as a way to implement match priority based on the order of |
2005 | /// patterns given and leftmost-longest can be useful for dictionary searching |
2006 | /// such that only the longest matching words are reported. |
2007 | /// |
2008 | /// # Relationship with regular expression alternations |
2009 | /// |
2010 | /// Understanding match semantics can be a little tricky, and one easy way |
2011 | /// to conceptualize non-overlapping matches from an Aho-Corasick automaton |
2012 | /// is to think about them as a simple alternation of literals in a regular |
2013 | /// expression. For example, let's say we wanted to match the strings |
2014 | /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It |
2015 | /// turns out that regular expression engines have two different ways of |
2016 | /// matching this alternation. The first way, leftmost-longest, is commonly |
2017 | /// found in POSIX compatible implementations of regular expressions (such as |
2018 | /// `grep`). The second way, leftmost-first, is commonly found in backtracking |
2019 | /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's |
2020 | /// regex engine do not use backtracking, but still implement leftmost-first |
2021 | /// semantics in an effort to match the behavior of dominant backtracking |
2022 | /// regex engines such as those found in Perl, Ruby, Python, Javascript and |
2023 | /// PHP.) |
2024 | /// |
2025 | /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex |
2026 | /// will match `Samwise` because it is the longest possible match, but a |
2027 | /// Perl-like regex will match `Sam` since it appears earlier in the |
2028 | /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine |
2029 | /// will never match `Samwise` since `Sam` will always have higher priority. |
2030 | /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to |
2031 | /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is |
2032 | /// still longest match, but it also appears earlier than `Sam`. |
2033 | /// |
2034 | /// The "standard" match semantics of Aho-Corasick generally don't correspond |
2035 | /// to the match semantics of any large group of regex implementations, so |
2036 | /// there's no direct analogy that can be made here. Standard match semantics |
2037 | /// are generally useful for overlapping matches, or if you just want to see |
2038 | /// matches as they are detected. |
2039 | /// |
2040 | /// The main conclusion to draw from this section is that the match semantics |
2041 | /// can be tweaked to precisely match either Perl-like regex alternations or |
2042 | /// POSIX regex alternations. |
2043 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
2044 | pub enum MatchKind { |
2045 | /// Use standard match semantics, which support overlapping matches. When |
2046 | /// used with non-overlapping matches, matches are reported as they are |
2047 | /// seen. |
2048 | Standard, |
2049 | /// Use leftmost-first match semantics, which reports leftmost matches. |
2050 | /// When there are multiple possible leftmost matches, the match |
2051 | /// corresponding to the pattern that appeared earlier when constructing |
2052 | /// the automaton is reported. |
2053 | /// |
2054 | /// This does **not** support overlapping matches or stream searching. If |
2055 | /// this match kind is used, attempting to find overlapping matches or |
2056 | /// stream matches will panic. |
2057 | LeftmostFirst, |
2058 | /// Use leftmost-longest match semantics, which reports leftmost matches. |
2059 | /// When there are multiple possible leftmost matches, the longest match |
2060 | /// is chosen. |
2061 | /// |
2062 | /// This does **not** support overlapping matches or stream searching. If |
2063 | /// this match kind is used, attempting to find overlapping matches or |
2064 | /// stream matches will panic. |
2065 | LeftmostLongest, |
2066 | /// Hints that destructuring should not be exhaustive. |
2067 | /// |
2068 | /// This enum may grow additional variants, so this makes sure clients |
2069 | /// don't count on exhaustive matching. (Otherwise, adding a new variant |
2070 | /// could break existing code.) |
2071 | #[doc (hidden)] |
2072 | __Nonexhaustive, |
2073 | } |
2074 | |
2075 | /// The default match kind is `MatchKind::Standard`. |
2076 | impl Default for MatchKind { |
2077 | fn default() -> MatchKind { |
2078 | MatchKind::Standard |
2079 | } |
2080 | } |
2081 | |
2082 | impl MatchKind { |
2083 | fn supports_overlapping(&self) -> bool { |
2084 | self.is_standard() |
2085 | } |
2086 | |
2087 | fn supports_stream(&self) -> bool { |
2088 | // TODO: It may be possible to support this. It's hard. |
2089 | // |
2090 | // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838 |
2091 | self.is_standard() |
2092 | } |
2093 | |
2094 | pub(crate) fn is_standard(&self) -> bool { |
2095 | *self == MatchKind::Standard |
2096 | } |
2097 | |
2098 | pub(crate) fn is_leftmost(&self) -> bool { |
2099 | *self == MatchKind::LeftmostFirst |
2100 | || *self == MatchKind::LeftmostLongest |
2101 | } |
2102 | |
2103 | pub(crate) fn is_leftmost_first(&self) -> bool { |
2104 | *self == MatchKind::LeftmostFirst |
2105 | } |
2106 | |
2107 | /// Convert this match kind into a packed match kind. If this match kind |
2108 | /// corresponds to standard semantics, then this returns None, since |
2109 | /// packed searching does not support standard semantics. |
2110 | pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> { |
2111 | match *self { |
2112 | MatchKind::Standard => None, |
2113 | MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst), |
2114 | MatchKind::LeftmostLongest => { |
2115 | Some(packed::MatchKind::LeftmostLongest) |
2116 | } |
2117 | MatchKind::__Nonexhaustive => unreachable!(), |
2118 | } |
2119 | } |
2120 | } |
2121 | |
2122 | #[cfg (test)] |
2123 | mod tests { |
2124 | use super::*; |
2125 | |
2126 | #[test ] |
2127 | fn oibits() { |
2128 | use std::panic::{RefUnwindSafe, UnwindSafe}; |
2129 | |
2130 | fn assert_send<T: Send>() {} |
2131 | fn assert_sync<T: Sync>() {} |
2132 | fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {} |
2133 | |
2134 | assert_send::<AhoCorasick>(); |
2135 | assert_sync::<AhoCorasick>(); |
2136 | assert_unwind_safe::<AhoCorasick>(); |
2137 | assert_send::<AhoCorasickBuilder>(); |
2138 | assert_sync::<AhoCorasickBuilder>(); |
2139 | assert_unwind_safe::<AhoCorasickBuilder>(); |
2140 | } |
2141 | } |
2142 | |