1use crate::util::id::PatternID;
2
3/// The kind of match semantics to use for a DFA.
4///
5/// The default match kind is `LeftmostFirst`.
6#[derive(Clone, Copy, Debug, Eq, PartialEq)]
7pub enum MatchKind {
8 /// Report all possible matches.
9 All,
10 /// Report only the leftmost matches. When multiple leftmost matches exist,
11 /// report the match corresponding to the part of the regex that appears
12 /// first in the syntax.
13 LeftmostFirst,
14 /// Hints that destructuring should not be exhaustive.
15 ///
16 /// This enum may grow additional variants, so this makes sure clients
17 /// don't count on exhaustive matching. (Otherwise, adding a new variant
18 /// could break existing code.)
19 #[doc(hidden)]
20 __Nonexhaustive,
21 // There is prior art in RE2 that shows that we should be able to add
22 // LeftmostLongest too. The tricky part of it is supporting ungreedy
23 // repetitions. Instead of treating all NFA states as having equivalent
24 // priority (as in 'All') or treating all NFA states as having distinct
25 // priority based on order (as in 'LeftmostFirst'), we instead group NFA
26 // states into sets, and treat members of each set as having equivalent
27 // priority, but having greater priority than all following members
28 // of different sets.
29 //
30 // However, it's not clear whether it's really worth adding this. After
31 // all, leftmost-longest can be emulated when using literals by using
32 // leftmost-first and sorting the literals by length in descending order.
33 // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
34 // always match `a` in `ab` when using leftmost-first, but leftmost-longest
35 // would match `ab`.
36}
37
38impl MatchKind {
39 #[cfg(feature = "alloc")]
40 pub(crate) fn continue_past_first_match(&self) -> bool {
41 *self == MatchKind::All
42 }
43}
44
45impl Default for MatchKind {
46 fn default() -> MatchKind {
47 MatchKind::LeftmostFirst
48 }
49}
50
51/// A representation of a match reported by a regex engine.
52///
53/// A match records the start and end offsets of the match in the haystack.
54///
55/// Every match guarantees that `start <= end`.
56#[derive(Clone, Debug, Eq, Hash, PartialEq)]
57pub struct Match {
58 /// The start offset of the match, inclusive.
59 start: usize,
60 /// The end offset of the match, exclusive.
61 end: usize,
62}
63
64impl Match {
65 /// Create a new match from a byte offset span.
66 ///
67 /// # Panics
68 ///
69 /// This panics if `end < start`.
70 #[inline]
71 pub fn new(start: usize, end: usize) -> Match {
72 assert!(start <= end);
73 Match { start, end }
74 }
75
76 /// The starting position of the match.
77 #[inline]
78 pub fn start(&self) -> usize {
79 self.start
80 }
81
82 /// The ending position of the match.
83 #[inline]
84 pub fn end(&self) -> usize {
85 self.end
86 }
87
88 /// Returns the match location as a range.
89 #[inline]
90 pub fn range(&self) -> core::ops::Range<usize> {
91 self.start..self.end
92 }
93
94 /// Returns true if and only if this match is empty. That is, when
95 /// `start() == end()`.
96 ///
97 /// An empty match can only be returned when the empty string was among
98 /// the patterns used to build the Aho-Corasick automaton.
99 #[inline]
100 pub fn is_empty(&self) -> bool {
101 self.start == self.end
102 }
103}
104
105/// A representation of a match reported by a DFA.
106///
107/// This is called a "half" match because it only includes the end location
108/// (or start location for a reverse match) of a match. This corresponds to the
109/// information that a single DFA scan can report. Getting the other half of
110/// the match requires a second scan with a reversed DFA.
111///
112/// A half match also includes the pattern that matched. The pattern is
113/// identified by an ID, which corresponds to its position (starting from `0`)
114/// relative to other patterns used to construct the corresponding DFA. If only
115/// a single pattern is provided to the DFA, then all matches are guaranteed to
116/// have a pattern ID of `0`.
117#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
118pub struct HalfMatch {
119 /// The pattern ID.
120 pub(crate) pattern: PatternID,
121 /// The offset of the match.
122 ///
123 /// For forward searches, the offset is exclusive. For reverse searches,
124 /// the offset is inclusive.
125 pub(crate) offset: usize,
126}
127
128impl HalfMatch {
129 /// Create a new half match from a pattern ID and a byte offset.
130 #[inline]
131 pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
132 HalfMatch { pattern, offset }
133 }
134
135 /// Create a new half match from a pattern ID and a byte offset.
136 ///
137 /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
138 /// [`PatternID`]. This panics if the given `usize` is not representable
139 /// as a `PatternID`.
140 #[inline]
141 pub fn must(pattern: usize, offset: usize) -> HalfMatch {
142 HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
143 }
144
145 /// Returns the ID of the pattern that matched.
146 ///
147 /// The ID of a pattern is derived from the position in which it was
148 /// originally inserted into the corresponding DFA. The first pattern has
149 /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
150 #[inline]
151 pub fn pattern(&self) -> PatternID {
152 self.pattern
153 }
154
155 /// The position of the match.
156 ///
157 /// If this match was produced by a forward search, then the offset is
158 /// exclusive. If this match was produced by a reverse search, then the
159 /// offset is inclusive.
160 #[inline]
161 pub fn offset(&self) -> usize {
162 self.offset
163 }
164}
165
166/// A representation of a multi match reported by a regex engine.
167///
168/// A multi match has two essential pieces of information: the identifier of
169/// the pattern that matched, along with the start and end offsets of the match
170/// in the haystack.
171///
172/// The pattern is identified by an ID, which corresponds to its position
173/// (starting from `0`) relative to other patterns used to construct the
174/// corresponding regex engine. If only a single pattern is provided, then all
175/// multi matches are guaranteed to have a pattern ID of `0`.
176///
177/// Every multi match guarantees that `start <= end`.
178#[derive(Clone, Debug, Eq, Hash, PartialEq)]
179pub struct MultiMatch {
180 /// The pattern ID.
181 pattern: PatternID,
182 /// The start offset of the match, inclusive.
183 start: usize,
184 /// The end offset of the match, exclusive.
185 end: usize,
186}
187
188impl MultiMatch {
189 /// Create a new match from a pattern ID and a byte offset span.
190 ///
191 /// # Panics
192 ///
193 /// This panics if `end < start`.
194 #[inline]
195 pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
196 assert!(start <= end);
197 MultiMatch { pattern, start, end }
198 }
199
200 /// Create a new match from a pattern ID and a byte offset span.
201 ///
202 /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
203 /// [`PatternID`]. This panics if the given `usize` is not representable
204 /// as a `PatternID`.
205 ///
206 /// # Panics
207 ///
208 /// This panics if `end < start` or if `pattern > PatternID::MAX`.
209 #[inline]
210 pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
211 MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
212 }
213
214 /// Returns the ID of the pattern that matched.
215 ///
216 /// The ID of a pattern is derived from the position in which it was
217 /// originally inserted into the corresponding regex engine. The first
218 /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
219 /// so on.
220 #[inline]
221 pub fn pattern(&self) -> PatternID {
222 self.pattern
223 }
224
225 /// The starting position of the match.
226 #[inline]
227 pub fn start(&self) -> usize {
228 self.start
229 }
230
231 /// The ending position of the match.
232 #[inline]
233 pub fn end(&self) -> usize {
234 self.end
235 }
236
237 /// Returns the match location as a range.
238 #[inline]
239 pub fn range(&self) -> core::ops::Range<usize> {
240 self.start..self.end
241 }
242
243 /// Returns true if and only if this match is empty. That is, when
244 /// `start() == end()`.
245 ///
246 /// An empty match can only be returned when the empty string was among
247 /// the patterns used to build the Aho-Corasick automaton.
248 #[inline]
249 pub fn is_empty(&self) -> bool {
250 self.start == self.end
251 }
252}
253
254/// An error type indicating that a search stopped prematurely without finding
255/// a match.
256///
257/// This error type implies that one cannot assume that no matches occur, since
258/// the search stopped before completing.
259///
260/// Normally, when one searches for something, the response is either an
261/// affirmative "it was found at this location" or a negative "not found at
262/// all." However, in some cases, a regex engine can be configured to stop its
263/// search before concluding whether a match exists or not. When this happens,
264/// it may be important for the caller to know why the regex engine gave up and
265/// where in the input it gave up at. This error type exposes the 'why' and the
266/// 'where.'
267///
268/// For example, the DFAs provided by this library generally cannot correctly
269/// implement Unicode word boundaries. Instead, they provide an option to
270/// eagerly support them on ASCII text (since Unicode word boundaries are
271/// equivalent to ASCII word boundaries when searching ASCII text), but will
272/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
273/// required to either report the failure to the caller (unergonomic) or
274/// otherwise fall back to some other regex engine (ergonomic, but potentially
275/// costly).
276///
277/// More generally, some regex engines offer the ability for callers to specify
278/// certain bytes that will trigger the regex engine to automatically quit if
279/// they are seen.
280///
281/// Still yet, there may be other reasons for a failed match. For example,
282/// the hybrid DFA provided by this crate can be configured to give up if it
283/// believes that it is not efficient. This in turn permits callers to choose a
284/// different regex engine.
285///
286/// # Advice
287///
288/// While this form of error reporting adds complexity, it is generally
289/// possible for callers to configure regex engines to never give up a search,
290/// and thus never return an error. Indeed, the default configuration for every
291/// regex engine in this crate is such that they will never stop searching
292/// early. Therefore, the only way to get a match error is if the regex engine
293/// is explicitly configured to do so. Options that enable this behavior
294/// document the new error conditions they imply.
295///
296/// Regex engines for which no errors are possible for any configuration will
297/// return the normal `Option<Match>` and not use this error type at all.
298///
299/// For example, regex engines in the `dfa` sub-module will only report
300/// `MatchError::Quit` if instructed by either
301/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
302/// or by
303/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
304#[derive(Clone, Debug, Eq, Hash, PartialEq)]
305pub enum MatchError {
306 // Note that the first version of this type was called `SearchError` and it
307 // included a third `None` variant to indicate that the search completed
308 // and no match was found. However, this was problematic for iterator
309 // APIs where the `None` sentinel for stopping iteration corresponds
310 // precisely to the "match not found" case. The fact that the `None`
311 // variant was buried inside this type was in turn quite awkward. So
312 // instead, I removed the `None` variant, renamed the type and used
313 // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
314 // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
315 // ergonomics by only putting the more complex API in the `try_` variants
316 // ("fallible") of search methods. The infallible APIs will instead just
317 // return `Option<Match>` and panic on error.
318 /// The search saw a "quit" byte at which it was instructed to stop
319 /// searching.
320 Quit {
321 /// The "quit" byte that was observed that caused the search to stop.
322 byte: u8,
323 /// The offset at which the quit byte was observed.
324 offset: usize,
325 },
326 /// The search, based on heuristics, determined that it would be better
327 /// to stop, typically to provide the caller an opportunity to use an
328 /// alternative regex engine.
329 ///
330 /// Currently, the only way for this to occur is via the lazy DFA and
331 /// only when it is configured to do so (it will not return this error by
332 /// default).
333 GaveUp {
334 /// The offset at which the search stopped. This corresponds to the
335 /// position immediately following the last byte scanned.
336 offset: usize,
337 },
338}
339
340#[cfg(feature = "std")]
341impl std::error::Error for MatchError {}
342
343impl core::fmt::Display for MatchError {
344 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
345 match *self {
346 MatchError::Quit { byte: u8, offset: usize } => write!(
347 f,
348 "quit search after observing byte \\x{:02X} at offset {}",
349 byte, offset,
350 ),
351 MatchError::GaveUp { offset: usize } => {
352 write!(f, "gave up searching at offset {}", offset)
353 }
354 }
355 }
356}
357