| 1 | /*! |
| 2 | A library for finding occurrences of many patterns at once. This library |
| 3 | provides multiple pattern search principally through an implementation of the |
| 4 | [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), |
| 5 | which builds a fast finite state machine for executing searches in linear time. |
| 6 | |
| 7 | Additionally, this library provides a number of configuration options for |
| 8 | building the automaton that permit controlling the space versus time trade |
| 9 | off. Other features include simple ASCII case insensitive matching, finding |
| 10 | overlapping matches, replacements, searching streams and even searching and |
| 11 | replacing text in streams. |
| 12 | |
| 13 | Finally, unlike most other Aho-Corasick implementations, this one |
| 14 | supports enabling [leftmost-first](MatchKind::LeftmostFirst) or |
| 15 | [leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a |
| 16 | (seemingly) novel alternative construction algorithm. For more details on what |
| 17 | match semantics means, see the [`MatchKind`] type. |
| 18 | |
| 19 | # Overview |
| 20 | |
| 21 | This section gives a brief overview of the primary types in this crate: |
| 22 | |
| 23 | * [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton. |
| 24 | This is the type you use to execute searches. |
| 25 | * [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and |
| 26 | supports configuring a number of options. |
| 27 | * [`Match`] represents a single match reported by an Aho-Corasick automaton. |
| 28 | Each match has two pieces of information: the pattern that matched and the |
| 29 | start and end byte offsets corresponding to the position in the haystack at |
| 30 | which it matched. |
| 31 | |
| 32 | # Example: basic searching |
| 33 | |
| 34 | This example shows how to search for occurrences of multiple patterns |
| 35 | simultaneously. Each match includes the pattern that matched along with the |
| 36 | byte offsets of the match. |
| 37 | |
| 38 | ``` |
| 39 | use aho_corasick::{AhoCorasick, PatternID}; |
| 40 | |
| 41 | let patterns = &["apple" , "maple" , "Snapple" ]; |
| 42 | let haystack = "Nobody likes maple in their apple flavored Snapple." ; |
| 43 | |
| 44 | let ac = AhoCorasick::new(patterns).unwrap(); |
| 45 | let mut matches = vec![]; |
| 46 | for mat in ac.find_iter(haystack) { |
| 47 | matches.push((mat.pattern(), mat.start(), mat.end())); |
| 48 | } |
| 49 | assert_eq!(matches, vec![ |
| 50 | (PatternID::must(1), 13, 18), |
| 51 | (PatternID::must(0), 28, 33), |
| 52 | (PatternID::must(2), 43, 50), |
| 53 | ]); |
| 54 | ``` |
| 55 | |
| 56 | # Example: case insensitivity |
| 57 | |
| 58 | This is like the previous example, but matches `Snapple` case insensitively |
| 59 | using `AhoCorasickBuilder`: |
| 60 | |
| 61 | ``` |
| 62 | use aho_corasick::{AhoCorasick, PatternID}; |
| 63 | |
| 64 | let patterns = &["apple" , "maple" , "snapple" ]; |
| 65 | let haystack = "Nobody likes maple in their apple flavored Snapple." ; |
| 66 | |
| 67 | let ac = AhoCorasick::builder() |
| 68 | .ascii_case_insensitive(true) |
| 69 | .build(patterns) |
| 70 | .unwrap(); |
| 71 | let mut matches = vec![]; |
| 72 | for mat in ac.find_iter(haystack) { |
| 73 | matches.push((mat.pattern(), mat.start(), mat.end())); |
| 74 | } |
| 75 | assert_eq!(matches, vec![ |
| 76 | (PatternID::must(1), 13, 18), |
| 77 | (PatternID::must(0), 28, 33), |
| 78 | (PatternID::must(2), 43, 50), |
| 79 | ]); |
| 80 | ``` |
| 81 | |
| 82 | # Example: replacing matches in a stream |
| 83 | |
| 84 | This example shows how to execute a search and replace on a stream without |
| 85 | loading the entire stream into memory first. |
| 86 | |
| 87 | ``` |
| 88 | # #[cfg (feature = "std" )] { |
| 89 | use aho_corasick::AhoCorasick; |
| 90 | |
| 91 | # fn example() -> Result<(), std::io::Error> { |
| 92 | let patterns = &["fox" , "brown" , "quick" ]; |
| 93 | let replace_with = &["sloth" , "grey" , "slow" ]; |
| 94 | |
| 95 | // In a real example, these might be `std::fs::File`s instead. All you need to |
| 96 | // do is supply a pair of `std::io::Read` and `std::io::Write` implementations. |
| 97 | let rdr = "The quick brown fox." ; |
| 98 | let mut wtr = vec![]; |
| 99 | |
| 100 | let ac = AhoCorasick::new(patterns).unwrap(); |
| 101 | ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; |
| 102 | assert_eq!(b"The slow grey sloth." .to_vec(), wtr); |
| 103 | # Ok(()) }; example().unwrap() |
| 104 | # } |
| 105 | ``` |
| 106 | |
| 107 | # Example: finding the leftmost first match |
| 108 | |
| 109 | In the textbook description of Aho-Corasick, its formulation is typically |
| 110 | structured such that it reports all possible matches, even when they overlap |
| 111 | with another. In many cases, overlapping matches may not be desired, such as |
| 112 | the case of finding all successive non-overlapping matches like you might with |
| 113 | a standard regular expression. |
| 114 | |
| 115 | Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do |
| 116 | this doesn't always work in the expected way, since it will report matches as |
| 117 | soon as they are seen. For example, consider matching the regex `Samwise|Sam` |
| 118 | against the text `Samwise`. Most regex engines (that are Perl-like, or |
| 119 | non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick |
| 120 | algorithm modified for reporting non-overlapping matches will report `Sam`. |
| 121 | |
| 122 | A novel contribution of this library is the ability to change the match |
| 123 | semantics of Aho-Corasick (without additional search time overhead) such that |
| 124 | `Samwise` is reported instead. For example, here's the standard approach: |
| 125 | |
| 126 | ``` |
| 127 | use aho_corasick::AhoCorasick; |
| 128 | |
| 129 | let patterns = &["Samwise" , "Sam" ]; |
| 130 | let haystack = "Samwise" ; |
| 131 | |
| 132 | let ac = AhoCorasick::new(patterns).unwrap(); |
| 133 | let mat = ac.find(haystack).expect("should have a match" ); |
| 134 | assert_eq!("Sam" , &haystack[mat.start()..mat.end()]); |
| 135 | ``` |
| 136 | |
| 137 | And now here's the leftmost-first version, which matches how a Perl-like |
| 138 | regex will work: |
| 139 | |
| 140 | ``` |
| 141 | use aho_corasick::{AhoCorasick, MatchKind}; |
| 142 | |
| 143 | let patterns = &["Samwise" , "Sam" ]; |
| 144 | let haystack = "Samwise" ; |
| 145 | |
| 146 | let ac = AhoCorasick::builder() |
| 147 | .match_kind(MatchKind::LeftmostFirst) |
| 148 | .build(patterns) |
| 149 | .unwrap(); |
| 150 | let mat = ac.find(haystack).expect("should have a match" ); |
| 151 | assert_eq!("Samwise" , &haystack[mat.start()..mat.end()]); |
| 152 | ``` |
| 153 | |
| 154 | In addition to leftmost-first semantics, this library also supports |
| 155 | leftmost-longest semantics, which match the POSIX behavior of a regular |
| 156 | expression alternation. See [`MatchKind`] for more details. |
| 157 | |
| 158 | # Prefilters |
| 159 | |
| 160 | While an Aho-Corasick automaton can perform admirably when compared to more |
| 161 | naive solutions, it is generally slower than more specialized algorithms that |
| 162 | are accelerated using vector instructions such as SIMD. |
| 163 | |
| 164 | For that reason, this library will internally use a "prefilter" to attempt |
| 165 | to accelerate searches when possible. Currently, this library has several |
| 166 | different algorithms it might use depending on the patterns provided. Once the |
| 167 | number of patterns gets too big, prefilters are no longer used. |
| 168 | |
| 169 | While a prefilter is generally good to have on by default since it works |
| 170 | well in the common case, it can lead to less predictable or even sub-optimal |
| 171 | performance in some cases. For that reason, prefilters can be explicitly |
| 172 | disabled via [`AhoCorasickBuilder::prefilter`]. |
| 173 | |
| 174 | # Lower level APIs |
| 175 | |
| 176 | This crate also provides several sub-modules that collectively expose many of |
| 177 | the implementation details of the main [`AhoCorasick`] type. Most users of this |
| 178 | library can completely ignore the submodules and their contents, but if you |
| 179 | needed finer grained control, some parts of them may be useful to you. Here is |
| 180 | a brief overview of each and why you might want to use them: |
| 181 | |
| 182 | * The [`packed`] sub-module contains a lower level API for using fast |
| 183 | vectorized routines for finding a small number of patterns in a haystack. |
| 184 | You might want to use this API when you want to completely side-step using |
| 185 | Aho-Corasick automata. Otherwise, the fast vectorized routines are used |
| 186 | automatically as prefilters for `AhoCorasick` searches whenever possible. |
| 187 | * The [`automaton`] sub-module provides a lower level finite state |
| 188 | machine interface that the various Aho-Corasick implementations in |
| 189 | this crate implement. This sub-module's main contribution is the |
| 190 | [`Automaton`](automaton::Automaton) trait, which permits manually walking the |
| 191 | state transitions of an Aho-Corasick automaton. |
| 192 | * The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of |
| 193 | the aforementioned `Automaton` trait. The main reason one might want to use |
| 194 | these sub-modules is to get access to a type that implements the `Automaton` |
| 195 | trait. (The top-level `AhoCorasick` type does not implement the `Automaton` |
| 196 | trait.) |
| 197 | |
| 198 | As mentioned above, if you aren't sure whether you need these sub-modules, |
| 199 | you should be able to safely ignore them and just focus on the [`AhoCorasick`] |
| 200 | type. |
| 201 | |
| 202 | # Crate features |
| 203 | |
| 204 | This crate exposes a few features for controlling dependency usage and whether |
| 205 | this crate can be used without the standard library. |
| 206 | |
| 207 | * **std** - |
| 208 | Enables support for the standard library. This feature is enabled by |
| 209 | default. When disabled, only `core` and `alloc` are used. At an API |
| 210 | level, enabling `std` enables `std::error::Error` trait impls for the |
| 211 | various error types, and higher level stream search routines such as |
| 212 | [`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required |
| 213 | to enable vectorized prefilters. Prefilters can greatly accelerate searches, |
| 214 | but generally only apply when the number of patterns is small (less than |
| 215 | ~100). |
| 216 | * **perf-literal** - |
| 217 | Enables support for literal prefilters that use vectorized routines from |
| 218 | external crates. This feature is enabled by default. If you're only using |
| 219 | Aho-Corasick for large numbers of patterns or otherwise can abide lower |
| 220 | throughput when searching with a small number of patterns, then it is |
| 221 | reasonable to disable this feature. |
| 222 | * **logging** - |
| 223 | Enables a dependency on the `log` crate and emits messages to aide in |
| 224 | diagnostics. This feature is disabled by default. |
| 225 | */ |
| 226 | |
| 227 | #![no_std ] |
| 228 | #![deny (missing_docs)] |
| 229 | #![deny (rustdoc::broken_intra_doc_links)] |
| 230 | #![cfg_attr (docsrs, feature(doc_auto_cfg))] |
| 231 | |
| 232 | extern crate alloc; |
| 233 | #[cfg (any(test, feature = "std" ))] |
| 234 | extern crate std; |
| 235 | |
| 236 | #[cfg (doctest)] |
| 237 | doc_comment::doctest!("../README.md" ); |
| 238 | |
| 239 | #[cfg (feature = "std" )] |
| 240 | pub use crate::ahocorasick::StreamFindIter; |
| 241 | pub use crate::{ |
| 242 | ahocorasick::{ |
| 243 | AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter, |
| 244 | FindOverlappingIter, |
| 245 | }, |
| 246 | util::{ |
| 247 | error::{BuildError, MatchError, MatchErrorKind}, |
| 248 | primitives::{PatternID, PatternIDError}, |
| 249 | search::{Anchored, Input, Match, MatchKind, Span, StartKind}, |
| 250 | }, |
| 251 | }; |
| 252 | |
| 253 | #[macro_use ] |
| 254 | mod macros; |
| 255 | |
| 256 | mod ahocorasick; |
| 257 | pub mod automaton; |
| 258 | pub mod dfa; |
| 259 | pub mod nfa; |
| 260 | pub mod packed; |
| 261 | #[cfg (test)] |
| 262 | mod tests; |
| 263 | // I wrote out the module for implementing fst::Automaton only to later realize |
| 264 | // that this would make fst a public dependency and fst is not at 1.0 yet. I |
| 265 | // decided to just keep the code in tree, but build it only during tests. |
| 266 | // |
| 267 | // TODO: I think I've changed my mind again. I'm considering pushing it out |
| 268 | // into either a separate crate or into 'fst' directly as an optional feature. |
| 269 | // #[cfg(test)] |
| 270 | // #[allow(dead_code)] |
| 271 | // mod transducer; |
| 272 | pub(crate) mod util; |
| 273 | |
| 274 | #[cfg (test)] |
| 275 | mod testoibits { |
| 276 | use std::panic::{RefUnwindSafe, UnwindSafe}; |
| 277 | |
| 278 | use super::*; |
| 279 | |
| 280 | fn assert_all<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {} |
| 281 | |
| 282 | #[test ] |
| 283 | fn oibits_main() { |
| 284 | assert_all::<AhoCorasick>(); |
| 285 | assert_all::<AhoCorasickBuilder>(); |
| 286 | assert_all::<AhoCorasickKind>(); |
| 287 | assert_all::<FindIter>(); |
| 288 | assert_all::<FindOverlappingIter>(); |
| 289 | |
| 290 | assert_all::<BuildError>(); |
| 291 | assert_all::<MatchError>(); |
| 292 | assert_all::<MatchErrorKind>(); |
| 293 | |
| 294 | assert_all::<Anchored>(); |
| 295 | assert_all::<Input>(); |
| 296 | assert_all::<Match>(); |
| 297 | assert_all::<MatchKind>(); |
| 298 | assert_all::<Span>(); |
| 299 | assert_all::<StartKind>(); |
| 300 | } |
| 301 | |
| 302 | #[test ] |
| 303 | fn oibits_automaton() { |
| 304 | use crate::{automaton, dfa::DFA}; |
| 305 | |
| 306 | assert_all::<automaton::FindIter<DFA>>(); |
| 307 | assert_all::<automaton::FindOverlappingIter<DFA>>(); |
| 308 | #[cfg (feature = "std" )] |
| 309 | assert_all::<automaton::StreamFindIter<DFA, std::io::Stdin>>(); |
| 310 | assert_all::<automaton::OverlappingState>(); |
| 311 | |
| 312 | assert_all::<automaton::Prefilter>(); |
| 313 | assert_all::<automaton::Candidate>(); |
| 314 | } |
| 315 | |
| 316 | #[test ] |
| 317 | fn oibits_packed() { |
| 318 | use crate::packed; |
| 319 | |
| 320 | assert_all::<packed::Config>(); |
| 321 | assert_all::<packed::Builder>(); |
| 322 | assert_all::<packed::Searcher>(); |
| 323 | assert_all::<packed::FindIter>(); |
| 324 | assert_all::<packed::MatchKind>(); |
| 325 | } |
| 326 | } |
| 327 | |