| 1 | use crate::util::id::PatternID; |
| 2 | |
| 3 | /// The kind of match semantics to use for a DFA. |
| 4 | /// |
| 5 | /// The default match kind is `LeftmostFirst`. |
| 6 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
| 7 | pub enum MatchKind { |
| 8 | /// Report all possible matches. |
| 9 | All, |
| 10 | /// Report only the leftmost matches. When multiple leftmost matches exist, |
| 11 | /// report the match corresponding to the part of the regex that appears |
| 12 | /// first in the syntax. |
| 13 | LeftmostFirst, |
| 14 | /// Hints that destructuring should not be exhaustive. |
| 15 | /// |
| 16 | /// This enum may grow additional variants, so this makes sure clients |
| 17 | /// don't count on exhaustive matching. (Otherwise, adding a new variant |
| 18 | /// could break existing code.) |
| 19 | #[doc (hidden)] |
| 20 | __Nonexhaustive, |
| 21 | // There is prior art in RE2 that shows that we should be able to add |
| 22 | // LeftmostLongest too. The tricky part of it is supporting ungreedy |
| 23 | // repetitions. Instead of treating all NFA states as having equivalent |
| 24 | // priority (as in 'All') or treating all NFA states as having distinct |
| 25 | // priority based on order (as in 'LeftmostFirst'), we instead group NFA |
| 26 | // states into sets, and treat members of each set as having equivalent |
| 27 | // priority, but having greater priority than all following members |
| 28 | // of different sets. |
| 29 | // |
| 30 | // However, it's not clear whether it's really worth adding this. After |
| 31 | // all, leftmost-longest can be emulated when using literals by using |
| 32 | // leftmost-first and sorting the literals by length in descending order. |
| 33 | // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will |
| 34 | // always match `a` in `ab` when using leftmost-first, but leftmost-longest |
| 35 | // would match `ab`. |
| 36 | } |
| 37 | |
| 38 | impl MatchKind { |
| 39 | #[cfg (feature = "alloc" )] |
| 40 | pub(crate) fn continue_past_first_match(&self) -> bool { |
| 41 | *self == MatchKind::All |
| 42 | } |
| 43 | } |
| 44 | |
| 45 | impl Default for MatchKind { |
| 46 | fn default() -> MatchKind { |
| 47 | MatchKind::LeftmostFirst |
| 48 | } |
| 49 | } |
| 50 | |
| 51 | /// A representation of a match reported by a regex engine. |
| 52 | /// |
| 53 | /// A match records the start and end offsets of the match in the haystack. |
| 54 | /// |
| 55 | /// Every match guarantees that `start <= end`. |
| 56 | #[derive (Clone, Debug, Eq, Hash, PartialEq)] |
| 57 | pub struct Match { |
| 58 | /// The start offset of the match, inclusive. |
| 59 | start: usize, |
| 60 | /// The end offset of the match, exclusive. |
| 61 | end: usize, |
| 62 | } |
| 63 | |
| 64 | impl Match { |
| 65 | /// Create a new match from a byte offset span. |
| 66 | /// |
| 67 | /// # Panics |
| 68 | /// |
| 69 | /// This panics if `end < start`. |
| 70 | #[inline ] |
| 71 | pub fn new(start: usize, end: usize) -> Match { |
| 72 | assert!(start <= end); |
| 73 | Match { start, end } |
| 74 | } |
| 75 | |
| 76 | /// The starting position of the match. |
| 77 | #[inline ] |
| 78 | pub fn start(&self) -> usize { |
| 79 | self.start |
| 80 | } |
| 81 | |
| 82 | /// The ending position of the match. |
| 83 | #[inline ] |
| 84 | pub fn end(&self) -> usize { |
| 85 | self.end |
| 86 | } |
| 87 | |
| 88 | /// Returns the match location as a range. |
| 89 | #[inline ] |
| 90 | pub fn range(&self) -> core::ops::Range<usize> { |
| 91 | self.start..self.end |
| 92 | } |
| 93 | |
| 94 | /// Returns true if and only if this match is empty. That is, when |
| 95 | /// `start() == end()`. |
| 96 | /// |
| 97 | /// An empty match can only be returned when the empty string was among |
| 98 | /// the patterns used to build the Aho-Corasick automaton. |
| 99 | #[inline ] |
| 100 | pub fn is_empty(&self) -> bool { |
| 101 | self.start == self.end |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | /// A representation of a match reported by a DFA. |
| 106 | /// |
| 107 | /// This is called a "half" match because it only includes the end location |
| 108 | /// (or start location for a reverse match) of a match. This corresponds to the |
| 109 | /// information that a single DFA scan can report. Getting the other half of |
| 110 | /// the match requires a second scan with a reversed DFA. |
| 111 | /// |
| 112 | /// A half match also includes the pattern that matched. The pattern is |
| 113 | /// identified by an ID, which corresponds to its position (starting from `0`) |
| 114 | /// relative to other patterns used to construct the corresponding DFA. If only |
| 115 | /// a single pattern is provided to the DFA, then all matches are guaranteed to |
| 116 | /// have a pattern ID of `0`. |
| 117 | #[derive (Clone, Copy, Debug, Eq, Hash, PartialEq)] |
| 118 | pub struct HalfMatch { |
| 119 | /// The pattern ID. |
| 120 | pub(crate) pattern: PatternID, |
| 121 | /// The offset of the match. |
| 122 | /// |
| 123 | /// For forward searches, the offset is exclusive. For reverse searches, |
| 124 | /// the offset is inclusive. |
| 125 | pub(crate) offset: usize, |
| 126 | } |
| 127 | |
| 128 | impl HalfMatch { |
| 129 | /// Create a new half match from a pattern ID and a byte offset. |
| 130 | #[inline ] |
| 131 | pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { |
| 132 | HalfMatch { pattern, offset } |
| 133 | } |
| 134 | |
| 135 | /// Create a new half match from a pattern ID and a byte offset. |
| 136 | /// |
| 137 | /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a |
| 138 | /// [`PatternID`]. This panics if the given `usize` is not representable |
| 139 | /// as a `PatternID`. |
| 140 | #[inline ] |
| 141 | pub fn must(pattern: usize, offset: usize) -> HalfMatch { |
| 142 | HalfMatch::new(PatternID::new(pattern).unwrap(), offset) |
| 143 | } |
| 144 | |
| 145 | /// Returns the ID of the pattern that matched. |
| 146 | /// |
| 147 | /// The ID of a pattern is derived from the position in which it was |
| 148 | /// originally inserted into the corresponding DFA. The first pattern has |
| 149 | /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. |
| 150 | #[inline ] |
| 151 | pub fn pattern(&self) -> PatternID { |
| 152 | self.pattern |
| 153 | } |
| 154 | |
| 155 | /// The position of the match. |
| 156 | /// |
| 157 | /// If this match was produced by a forward search, then the offset is |
| 158 | /// exclusive. If this match was produced by a reverse search, then the |
| 159 | /// offset is inclusive. |
| 160 | #[inline ] |
| 161 | pub fn offset(&self) -> usize { |
| 162 | self.offset |
| 163 | } |
| 164 | } |
| 165 | |
| 166 | /// A representation of a multi match reported by a regex engine. |
| 167 | /// |
| 168 | /// A multi match has two essential pieces of information: the identifier of |
| 169 | /// the pattern that matched, along with the start and end offsets of the match |
| 170 | /// in the haystack. |
| 171 | /// |
| 172 | /// The pattern is identified by an ID, which corresponds to its position |
| 173 | /// (starting from `0`) relative to other patterns used to construct the |
| 174 | /// corresponding regex engine. If only a single pattern is provided, then all |
| 175 | /// multi matches are guaranteed to have a pattern ID of `0`. |
| 176 | /// |
| 177 | /// Every multi match guarantees that `start <= end`. |
| 178 | #[derive (Clone, Debug, Eq, Hash, PartialEq)] |
| 179 | pub struct MultiMatch { |
| 180 | /// The pattern ID. |
| 181 | pattern: PatternID, |
| 182 | /// The start offset of the match, inclusive. |
| 183 | start: usize, |
| 184 | /// The end offset of the match, exclusive. |
| 185 | end: usize, |
| 186 | } |
| 187 | |
| 188 | impl MultiMatch { |
| 189 | /// Create a new match from a pattern ID and a byte offset span. |
| 190 | /// |
| 191 | /// # Panics |
| 192 | /// |
| 193 | /// This panics if `end < start`. |
| 194 | #[inline ] |
| 195 | pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch { |
| 196 | assert!(start <= end); |
| 197 | MultiMatch { pattern, start, end } |
| 198 | } |
| 199 | |
| 200 | /// Create a new match from a pattern ID and a byte offset span. |
| 201 | /// |
| 202 | /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a |
| 203 | /// [`PatternID`]. This panics if the given `usize` is not representable |
| 204 | /// as a `PatternID`. |
| 205 | /// |
| 206 | /// # Panics |
| 207 | /// |
| 208 | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |
| 209 | #[inline ] |
| 210 | pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch { |
| 211 | MultiMatch::new(PatternID::new(pattern).unwrap(), start, end) |
| 212 | } |
| 213 | |
| 214 | /// Returns the ID of the pattern that matched. |
| 215 | /// |
| 216 | /// The ID of a pattern is derived from the position in which it was |
| 217 | /// originally inserted into the corresponding regex engine. The first |
| 218 | /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and |
| 219 | /// so on. |
| 220 | #[inline ] |
| 221 | pub fn pattern(&self) -> PatternID { |
| 222 | self.pattern |
| 223 | } |
| 224 | |
| 225 | /// The starting position of the match. |
| 226 | #[inline ] |
| 227 | pub fn start(&self) -> usize { |
| 228 | self.start |
| 229 | } |
| 230 | |
| 231 | /// The ending position of the match. |
| 232 | #[inline ] |
| 233 | pub fn end(&self) -> usize { |
| 234 | self.end |
| 235 | } |
| 236 | |
| 237 | /// Returns the match location as a range. |
| 238 | #[inline ] |
| 239 | pub fn range(&self) -> core::ops::Range<usize> { |
| 240 | self.start..self.end |
| 241 | } |
| 242 | |
| 243 | /// Returns true if and only if this match is empty. That is, when |
| 244 | /// `start() == end()`. |
| 245 | /// |
| 246 | /// An empty match can only be returned when the empty string was among |
| 247 | /// the patterns used to build the Aho-Corasick automaton. |
| 248 | #[inline ] |
| 249 | pub fn is_empty(&self) -> bool { |
| 250 | self.start == self.end |
| 251 | } |
| 252 | } |
| 253 | |
| 254 | /// An error type indicating that a search stopped prematurely without finding |
| 255 | /// a match. |
| 256 | /// |
| 257 | /// This error type implies that one cannot assume that no matches occur, since |
| 258 | /// the search stopped before completing. |
| 259 | /// |
| 260 | /// Normally, when one searches for something, the response is either an |
| 261 | /// affirmative "it was found at this location" or a negative "not found at |
| 262 | /// all." However, in some cases, a regex engine can be configured to stop its |
| 263 | /// search before concluding whether a match exists or not. When this happens, |
| 264 | /// it may be important for the caller to know why the regex engine gave up and |
| 265 | /// where in the input it gave up at. This error type exposes the 'why' and the |
| 266 | /// 'where.' |
| 267 | /// |
| 268 | /// For example, the DFAs provided by this library generally cannot correctly |
| 269 | /// implement Unicode word boundaries. Instead, they provide an option to |
| 270 | /// eagerly support them on ASCII text (since Unicode word boundaries are |
| 271 | /// equivalent to ASCII word boundaries when searching ASCII text), but will |
| 272 | /// "give up" if a non-ASCII byte is seen. In such cases, one is usually |
| 273 | /// required to either report the failure to the caller (unergonomic) or |
| 274 | /// otherwise fall back to some other regex engine (ergonomic, but potentially |
| 275 | /// costly). |
| 276 | /// |
| 277 | /// More generally, some regex engines offer the ability for callers to specify |
| 278 | /// certain bytes that will trigger the regex engine to automatically quit if |
| 279 | /// they are seen. |
| 280 | /// |
| 281 | /// Still yet, there may be other reasons for a failed match. For example, |
| 282 | /// the hybrid DFA provided by this crate can be configured to give up if it |
| 283 | /// believes that it is not efficient. This in turn permits callers to choose a |
| 284 | /// different regex engine. |
| 285 | /// |
| 286 | /// # Advice |
| 287 | /// |
| 288 | /// While this form of error reporting adds complexity, it is generally |
| 289 | /// possible for callers to configure regex engines to never give up a search, |
| 290 | /// and thus never return an error. Indeed, the default configuration for every |
| 291 | /// regex engine in this crate is such that they will never stop searching |
| 292 | /// early. Therefore, the only way to get a match error is if the regex engine |
| 293 | /// is explicitly configured to do so. Options that enable this behavior |
| 294 | /// document the new error conditions they imply. |
| 295 | /// |
| 296 | /// Regex engines for which no errors are possible for any configuration will |
| 297 | /// return the normal `Option<Match>` and not use this error type at all. |
| 298 | /// |
| 299 | /// For example, regex engines in the `dfa` sub-module will only report |
| 300 | /// `MatchError::Quit` if instructed by either |
| 301 | /// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary) |
| 302 | /// or by |
| 303 | /// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit). |
| 304 | #[derive (Clone, Debug, Eq, Hash, PartialEq)] |
| 305 | pub enum MatchError { |
| 306 | // Note that the first version of this type was called `SearchError` and it |
| 307 | // included a third `None` variant to indicate that the search completed |
| 308 | // and no match was found. However, this was problematic for iterator |
| 309 | // APIs where the `None` sentinel for stopping iteration corresponds |
| 310 | // precisely to the "match not found" case. The fact that the `None` |
| 311 | // variant was buried inside this type was in turn quite awkward. So |
| 312 | // instead, I removed the `None` variant, renamed the type and used |
| 313 | // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the |
| 314 | // conceptually simpler `Result<Match, MatchError>`. However, we "regain" |
| 315 | // ergonomics by only putting the more complex API in the `try_` variants |
| 316 | // ("fallible") of search methods. The infallible APIs will instead just |
| 317 | // return `Option<Match>` and panic on error. |
| 318 | /// The search saw a "quit" byte at which it was instructed to stop |
| 319 | /// searching. |
| 320 | Quit { |
| 321 | /// The "quit" byte that was observed that caused the search to stop. |
| 322 | byte: u8, |
| 323 | /// The offset at which the quit byte was observed. |
| 324 | offset: usize, |
| 325 | }, |
| 326 | /// The search, based on heuristics, determined that it would be better |
| 327 | /// to stop, typically to provide the caller an opportunity to use an |
| 328 | /// alternative regex engine. |
| 329 | /// |
| 330 | /// Currently, the only way for this to occur is via the lazy DFA and |
| 331 | /// only when it is configured to do so (it will not return this error by |
| 332 | /// default). |
| 333 | GaveUp { |
| 334 | /// The offset at which the search stopped. This corresponds to the |
| 335 | /// position immediately following the last byte scanned. |
| 336 | offset: usize, |
| 337 | }, |
| 338 | } |
| 339 | |
| 340 | #[cfg (feature = "std" )] |
| 341 | impl std::error::Error for MatchError {} |
| 342 | |
| 343 | impl core::fmt::Display for MatchError { |
| 344 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 345 | match *self { |
| 346 | MatchError::Quit { byte: u8, offset: usize } => write!( |
| 347 | f, |
| 348 | "quit search after observing byte \\x {:02X} at offset {}" , |
| 349 | byte, offset, |
| 350 | ), |
| 351 | MatchError::GaveUp { offset: usize } => { |
| 352 | write!(f, "gave up searching at offset {}" , offset) |
| 353 | } |
| 354 | } |
| 355 | } |
| 356 | } |
| 357 | |