1 | use crate::util::id::PatternID; |
2 | |
3 | /// The kind of match semantics to use for a DFA. |
4 | /// |
5 | /// The default match kind is `LeftmostFirst`. |
6 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
7 | pub enum MatchKind { |
8 | /// Report all possible matches. |
9 | All, |
10 | /// Report only the leftmost matches. When multiple leftmost matches exist, |
11 | /// report the match corresponding to the part of the regex that appears |
12 | /// first in the syntax. |
13 | LeftmostFirst, |
14 | /// Hints that destructuring should not be exhaustive. |
15 | /// |
16 | /// This enum may grow additional variants, so this makes sure clients |
17 | /// don't count on exhaustive matching. (Otherwise, adding a new variant |
18 | /// could break existing code.) |
19 | #[doc (hidden)] |
20 | __Nonexhaustive, |
21 | // There is prior art in RE2 that shows that we should be able to add |
22 | // LeftmostLongest too. The tricky part of it is supporting ungreedy |
23 | // repetitions. Instead of treating all NFA states as having equivalent |
24 | // priority (as in 'All') or treating all NFA states as having distinct |
25 | // priority based on order (as in 'LeftmostFirst'), we instead group NFA |
26 | // states into sets, and treat members of each set as having equivalent |
27 | // priority, but having greater priority than all following members |
28 | // of different sets. |
29 | // |
30 | // However, it's not clear whether it's really worth adding this. After |
31 | // all, leftmost-longest can be emulated when using literals by using |
32 | // leftmost-first and sorting the literals by length in descending order. |
33 | // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will |
34 | // always match `a` in `ab` when using leftmost-first, but leftmost-longest |
35 | // would match `ab`. |
36 | } |
37 | |
38 | impl MatchKind { |
39 | #[cfg (feature = "alloc" )] |
40 | pub(crate) fn continue_past_first_match(&self) -> bool { |
41 | *self == MatchKind::All |
42 | } |
43 | } |
44 | |
45 | impl Default for MatchKind { |
46 | fn default() -> MatchKind { |
47 | MatchKind::LeftmostFirst |
48 | } |
49 | } |
50 | |
51 | /// A representation of a match reported by a regex engine. |
52 | /// |
53 | /// A match records the start and end offsets of the match in the haystack. |
54 | /// |
55 | /// Every match guarantees that `start <= end`. |
56 | #[derive (Clone, Debug, Eq, Hash, PartialEq)] |
57 | pub struct Match { |
58 | /// The start offset of the match, inclusive. |
59 | start: usize, |
60 | /// The end offset of the match, exclusive. |
61 | end: usize, |
62 | } |
63 | |
64 | impl Match { |
65 | /// Create a new match from a byte offset span. |
66 | /// |
67 | /// # Panics |
68 | /// |
69 | /// This panics if `end < start`. |
70 | #[inline ] |
71 | pub fn new(start: usize, end: usize) -> Match { |
72 | assert!(start <= end); |
73 | Match { start, end } |
74 | } |
75 | |
76 | /// The starting position of the match. |
77 | #[inline ] |
78 | pub fn start(&self) -> usize { |
79 | self.start |
80 | } |
81 | |
82 | /// The ending position of the match. |
83 | #[inline ] |
84 | pub fn end(&self) -> usize { |
85 | self.end |
86 | } |
87 | |
88 | /// Returns the match location as a range. |
89 | #[inline ] |
90 | pub fn range(&self) -> core::ops::Range<usize> { |
91 | self.start..self.end |
92 | } |
93 | |
94 | /// Returns true if and only if this match is empty. That is, when |
95 | /// `start() == end()`. |
96 | /// |
97 | /// An empty match can only be returned when the empty string was among |
98 | /// the patterns used to build the Aho-Corasick automaton. |
99 | #[inline ] |
100 | pub fn is_empty(&self) -> bool { |
101 | self.start == self.end |
102 | } |
103 | } |
104 | |
105 | /// A representation of a match reported by a DFA. |
106 | /// |
107 | /// This is called a "half" match because it only includes the end location |
108 | /// (or start location for a reverse match) of a match. This corresponds to the |
109 | /// information that a single DFA scan can report. Getting the other half of |
110 | /// the match requires a second scan with a reversed DFA. |
111 | /// |
112 | /// A half match also includes the pattern that matched. The pattern is |
113 | /// identified by an ID, which corresponds to its position (starting from `0`) |
114 | /// relative to other patterns used to construct the corresponding DFA. If only |
115 | /// a single pattern is provided to the DFA, then all matches are guaranteed to |
116 | /// have a pattern ID of `0`. |
117 | #[derive (Clone, Copy, Debug, Eq, Hash, PartialEq)] |
118 | pub struct HalfMatch { |
119 | /// The pattern ID. |
120 | pub(crate) pattern: PatternID, |
121 | /// The offset of the match. |
122 | /// |
123 | /// For forward searches, the offset is exclusive. For reverse searches, |
124 | /// the offset is inclusive. |
125 | pub(crate) offset: usize, |
126 | } |
127 | |
128 | impl HalfMatch { |
129 | /// Create a new half match from a pattern ID and a byte offset. |
130 | #[inline ] |
131 | pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { |
132 | HalfMatch { pattern, offset } |
133 | } |
134 | |
135 | /// Create a new half match from a pattern ID and a byte offset. |
136 | /// |
137 | /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a |
138 | /// [`PatternID`]. This panics if the given `usize` is not representable |
139 | /// as a `PatternID`. |
140 | #[inline ] |
141 | pub fn must(pattern: usize, offset: usize) -> HalfMatch { |
142 | HalfMatch::new(PatternID::new(pattern).unwrap(), offset) |
143 | } |
144 | |
145 | /// Returns the ID of the pattern that matched. |
146 | /// |
147 | /// The ID of a pattern is derived from the position in which it was |
148 | /// originally inserted into the corresponding DFA. The first pattern has |
149 | /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. |
150 | #[inline ] |
151 | pub fn pattern(&self) -> PatternID { |
152 | self.pattern |
153 | } |
154 | |
155 | /// The position of the match. |
156 | /// |
157 | /// If this match was produced by a forward search, then the offset is |
158 | /// exclusive. If this match was produced by a reverse search, then the |
159 | /// offset is inclusive. |
160 | #[inline ] |
161 | pub fn offset(&self) -> usize { |
162 | self.offset |
163 | } |
164 | } |
165 | |
166 | /// A representation of a multi match reported by a regex engine. |
167 | /// |
168 | /// A multi match has two essential pieces of information: the identifier of |
169 | /// the pattern that matched, along with the start and end offsets of the match |
170 | /// in the haystack. |
171 | /// |
172 | /// The pattern is identified by an ID, which corresponds to its position |
173 | /// (starting from `0`) relative to other patterns used to construct the |
174 | /// corresponding regex engine. If only a single pattern is provided, then all |
175 | /// multi matches are guaranteed to have a pattern ID of `0`. |
176 | /// |
177 | /// Every multi match guarantees that `start <= end`. |
178 | #[derive (Clone, Debug, Eq, Hash, PartialEq)] |
179 | pub struct MultiMatch { |
180 | /// The pattern ID. |
181 | pattern: PatternID, |
182 | /// The start offset of the match, inclusive. |
183 | start: usize, |
184 | /// The end offset of the match, exclusive. |
185 | end: usize, |
186 | } |
187 | |
188 | impl MultiMatch { |
189 | /// Create a new match from a pattern ID and a byte offset span. |
190 | /// |
191 | /// # Panics |
192 | /// |
193 | /// This panics if `end < start`. |
194 | #[inline ] |
195 | pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch { |
196 | assert!(start <= end); |
197 | MultiMatch { pattern, start, end } |
198 | } |
199 | |
200 | /// Create a new match from a pattern ID and a byte offset span. |
201 | /// |
202 | /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a |
203 | /// [`PatternID`]. This panics if the given `usize` is not representable |
204 | /// as a `PatternID`. |
205 | /// |
206 | /// # Panics |
207 | /// |
208 | /// This panics if `end < start` or if `pattern > PatternID::MAX`. |
209 | #[inline ] |
210 | pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch { |
211 | MultiMatch::new(PatternID::new(pattern).unwrap(), start, end) |
212 | } |
213 | |
214 | /// Returns the ID of the pattern that matched. |
215 | /// |
216 | /// The ID of a pattern is derived from the position in which it was |
217 | /// originally inserted into the corresponding regex engine. The first |
218 | /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and |
219 | /// so on. |
220 | #[inline ] |
221 | pub fn pattern(&self) -> PatternID { |
222 | self.pattern |
223 | } |
224 | |
225 | /// The starting position of the match. |
226 | #[inline ] |
227 | pub fn start(&self) -> usize { |
228 | self.start |
229 | } |
230 | |
231 | /// The ending position of the match. |
232 | #[inline ] |
233 | pub fn end(&self) -> usize { |
234 | self.end |
235 | } |
236 | |
237 | /// Returns the match location as a range. |
238 | #[inline ] |
239 | pub fn range(&self) -> core::ops::Range<usize> { |
240 | self.start..self.end |
241 | } |
242 | |
243 | /// Returns true if and only if this match is empty. That is, when |
244 | /// `start() == end()`. |
245 | /// |
246 | /// An empty match can only be returned when the empty string was among |
247 | /// the patterns used to build the Aho-Corasick automaton. |
248 | #[inline ] |
249 | pub fn is_empty(&self) -> bool { |
250 | self.start == self.end |
251 | } |
252 | } |
253 | |
254 | /// An error type indicating that a search stopped prematurely without finding |
255 | /// a match. |
256 | /// |
257 | /// This error type implies that one cannot assume that no matches occur, since |
258 | /// the search stopped before completing. |
259 | /// |
260 | /// Normally, when one searches for something, the response is either an |
261 | /// affirmative "it was found at this location" or a negative "not found at |
262 | /// all." However, in some cases, a regex engine can be configured to stop its |
263 | /// search before concluding whether a match exists or not. When this happens, |
264 | /// it may be important for the caller to know why the regex engine gave up and |
265 | /// where in the input it gave up at. This error type exposes the 'why' and the |
266 | /// 'where.' |
267 | /// |
268 | /// For example, the DFAs provided by this library generally cannot correctly |
269 | /// implement Unicode word boundaries. Instead, they provide an option to |
270 | /// eagerly support them on ASCII text (since Unicode word boundaries are |
271 | /// equivalent to ASCII word boundaries when searching ASCII text), but will |
272 | /// "give up" if a non-ASCII byte is seen. In such cases, one is usually |
273 | /// required to either report the failure to the caller (unergonomic) or |
274 | /// otherwise fall back to some other regex engine (ergonomic, but potentially |
275 | /// costly). |
276 | /// |
277 | /// More generally, some regex engines offer the ability for callers to specify |
278 | /// certain bytes that will trigger the regex engine to automatically quit if |
279 | /// they are seen. |
280 | /// |
281 | /// Still yet, there may be other reasons for a failed match. For example, |
282 | /// the hybrid DFA provided by this crate can be configured to give up if it |
283 | /// believes that it is not efficient. This in turn permits callers to choose a |
284 | /// different regex engine. |
285 | /// |
286 | /// # Advice |
287 | /// |
288 | /// While this form of error reporting adds complexity, it is generally |
289 | /// possible for callers to configure regex engines to never give up a search, |
290 | /// and thus never return an error. Indeed, the default configuration for every |
291 | /// regex engine in this crate is such that they will never stop searching |
292 | /// early. Therefore, the only way to get a match error is if the regex engine |
293 | /// is explicitly configured to do so. Options that enable this behavior |
294 | /// document the new error conditions they imply. |
295 | /// |
296 | /// Regex engines for which no errors are possible for any configuration will |
297 | /// return the normal `Option<Match>` and not use this error type at all. |
298 | /// |
299 | /// For example, regex engines in the `dfa` sub-module will only report |
300 | /// `MatchError::Quit` if instructed by either |
301 | /// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary) |
302 | /// or by |
303 | /// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit). |
304 | #[derive (Clone, Debug, Eq, Hash, PartialEq)] |
305 | pub enum MatchError { |
306 | // Note that the first version of this type was called `SearchError` and it |
307 | // included a third `None` variant to indicate that the search completed |
308 | // and no match was found. However, this was problematic for iterator |
309 | // APIs where the `None` sentinel for stopping iteration corresponds |
310 | // precisely to the "match not found" case. The fact that the `None` |
311 | // variant was buried inside this type was in turn quite awkward. So |
312 | // instead, I removed the `None` variant, renamed the type and used |
313 | // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the |
314 | // conceptually simpler `Result<Match, MatchError>`. However, we "regain" |
315 | // ergonomics by only putting the more complex API in the `try_` variants |
316 | // ("fallible") of search methods. The infallible APIs will instead just |
317 | // return `Option<Match>` and panic on error. |
318 | /// The search saw a "quit" byte at which it was instructed to stop |
319 | /// searching. |
320 | Quit { |
321 | /// The "quit" byte that was observed that caused the search to stop. |
322 | byte: u8, |
323 | /// The offset at which the quit byte was observed. |
324 | offset: usize, |
325 | }, |
326 | /// The search, based on heuristics, determined that it would be better |
327 | /// to stop, typically to provide the caller an opportunity to use an |
328 | /// alternative regex engine. |
329 | /// |
330 | /// Currently, the only way for this to occur is via the lazy DFA and |
331 | /// only when it is configured to do so (it will not return this error by |
332 | /// default). |
333 | GaveUp { |
334 | /// The offset at which the search stopped. This corresponds to the |
335 | /// position immediately following the last byte scanned. |
336 | offset: usize, |
337 | }, |
338 | } |
339 | |
340 | #[cfg (feature = "std" )] |
341 | impl std::error::Error for MatchError {} |
342 | |
343 | impl core::fmt::Display for MatchError { |
344 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
345 | match *self { |
346 | MatchError::Quit { byte: u8, offset: usize } => write!( |
347 | f, |
348 | "quit search after observing byte \\x {:02X} at offset {}" , |
349 | byte, offset, |
350 | ), |
351 | MatchError::GaveUp { offset: usize } => { |
352 | write!(f, "gave up searching at offset {}" , offset) |
353 | } |
354 | } |
355 | } |
356 | } |
357 | |