matchtypes.rs source code [crates/regex_automata/src/util/matchtypes.rs]

1	use crate::util::id::PatternID;
2
3	/// The kind of match semantics to use for a DFA.
4	///
5	/// The default match kind is `LeftmostFirst`.
6	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
7	pub enum MatchKind {
8	/// Report all possible matches.
9	All,
10	/// Report only the leftmost matches. When multiple leftmost matches exist,
11	/// report the match corresponding to the part of the regex that appears
12	/// first in the syntax.
13	LeftmostFirst,
14	/// Hints that destructuring should not be exhaustive.
15	///
16	/// This enum may grow additional variants, so this makes sure clients
17	/// don't count on exhaustive matching. (Otherwise, adding a new variant
18	/// could break existing code.)
19	#[doc(hidden)]
20	__Nonexhaustive,
21	// There is prior art in RE2 that shows that we should be able to add
22	// LeftmostLongest too. The tricky part of it is supporting ungreedy
23	// repetitions. Instead of treating all NFA states as having equivalent
24	// priority (as in 'All') or treating all NFA states as having distinct
25	// priority based on order (as in 'LeftmostFirst'), we instead group NFA
26	// states into sets, and treat members of each set as having equivalent
27	// priority, but having greater priority than all following members
28	// of different sets.
29	//
30	// However, it's not clear whether it's really worth adding this. After
31	// all, leftmost-longest can be emulated when using literals by using
32	// leftmost-first and sorting the literals by length in descending order.
33	// However, this won't work for arbitrary regexes. e.g., `\w\|\w\w` will
34	// always match `a` in `ab` when using leftmost-first, but leftmost-longest
35	// would match `ab`.
36	}
37
38	impl MatchKind {
39	#[cfg(feature = "alloc")]
40	pub(crate) fn continue_past_first_match(&self) -> bool {
41	*self == MatchKind::All
42	}
43	}
44
45	impl Default for MatchKind {
46	fn default() -> MatchKind {
47	MatchKind::LeftmostFirst
48	}
49	}
50
51	/// A representation of a match reported by a regex engine.
52	///
53	/// A match records the start and end offsets of the match in the haystack.
54	///
55	/// Every match guarantees that `start <= end`.
56	#[derive(Clone, Debug, Eq, Hash, PartialEq)]
57	pub struct Match {
58	/// The start offset of the match, inclusive.
59	start: usize,
60	/// The end offset of the match, exclusive.
61	end: usize,
62	}
63
64	impl Match {
65	/// Create a new match from a byte offset span.
66	///
67	/// # Panics
68	///
69	/// This panics if `end < start`.
70	#[inline]
71	pub fn new(start: usize, end: usize) -> Match {
72	assert!(start <= end);
73	Match { start, end }
74	}
75
76	/// The starting position of the match.
77	#[inline]
78	pub fn start(&self) -> usize {
79	self.start
80	}
81
82	/// The ending position of the match.
83	#[inline]
84	pub fn end(&self) -> usize {
85	self.end
86	}
87
88	/// Returns the match location as a range.
89	#[inline]
90	pub fn range(&self) -> core::ops::Range<usize> {
91	self.start..self.end
92	}
93
94	/// Returns true if and only if this match is empty. That is, when
95	/// `start() == end()`.
96	///
97	/// An empty match can only be returned when the empty string was among
98	/// the patterns used to build the Aho-Corasick automaton.
99	#[inline]
100	pub fn is_empty(&self) -> bool {
101	self.start == self.end
102	}
103	}
104
105	/// A representation of a match reported by a DFA.
106	///
107	/// This is called a "half" match because it only includes the end location
108	/// (or start location for a reverse match) of a match. This corresponds to the
109	/// information that a single DFA scan can report. Getting the other half of
110	/// the match requires a second scan with a reversed DFA.
111	///
112	/// A half match also includes the pattern that matched. The pattern is
113	/// identified by an ID, which corresponds to its position (starting from `0`)
114	/// relative to other patterns used to construct the corresponding DFA. If only
115	/// a single pattern is provided to the DFA, then all matches are guaranteed to
116	/// have a pattern ID of `0`.
117	#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
118	pub struct HalfMatch {
119	/// The pattern ID.
120	pub(crate) pattern: PatternID,
121	/// The offset of the match.
122	///
123	/// For forward searches, the offset is exclusive. For reverse searches,
124	/// the offset is inclusive.
125	pub(crate) offset: usize,
126	}
127
128	impl HalfMatch {
129	/// Create a new half match from a pattern ID and a byte offset.
130	#[inline]
131	pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
132	HalfMatch { pattern, offset }
133	}
134
135	/// Create a new half match from a pattern ID and a byte offset.
136	///
137	/// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
138	/// [`PatternID`]. This panics if the given `usize` is not representable
139	/// as a `PatternID`.
140	#[inline]
141	pub fn must(pattern: usize, offset: usize) -> HalfMatch {
142	HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
143	}
144
145	/// Returns the ID of the pattern that matched.
146	///
147	/// The ID of a pattern is derived from the position in which it was
148	/// originally inserted into the corresponding DFA. The first pattern has
149	/// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
150	#[inline]
151	pub fn pattern(&self) -> PatternID {
152	self.pattern
153	}
154
155	/// The position of the match.
156	///
157	/// If this match was produced by a forward search, then the offset is
158	/// exclusive. If this match was produced by a reverse search, then the
159	/// offset is inclusive.
160	#[inline]
161	pub fn offset(&self) -> usize {
162	self.offset
163	}
164	}
165
166	/// A representation of a multi match reported by a regex engine.
167	///
168	/// A multi match has two essential pieces of information: the identifier of
169	/// the pattern that matched, along with the start and end offsets of the match
170	/// in the haystack.
171	///
172	/// The pattern is identified by an ID, which corresponds to its position
173	/// (starting from `0`) relative to other patterns used to construct the
174	/// corresponding regex engine. If only a single pattern is provided, then all
175	/// multi matches are guaranteed to have a pattern ID of `0`.
176	///
177	/// Every multi match guarantees that `start <= end`.
178	#[derive(Clone, Debug, Eq, Hash, PartialEq)]
179	pub struct MultiMatch {
180	/// The pattern ID.
181	pattern: PatternID,
182	/// The start offset of the match, inclusive.
183	start: usize,
184	/// The end offset of the match, exclusive.
185	end: usize,
186	}
187
188	impl MultiMatch {
189	/// Create a new match from a pattern ID and a byte offset span.
190	///
191	/// # Panics
192	///
193	/// This panics if `end < start`.
194	#[inline]
195	pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
196	assert!(start <= end);
197	MultiMatch { pattern, start, end }
198	}
199
200	/// Create a new match from a pattern ID and a byte offset span.
201	///
202	/// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
203	/// [`PatternID`]. This panics if the given `usize` is not representable
204	/// as a `PatternID`.
205	///
206	/// # Panics
207	///
208	/// This panics if `end < start` or if `pattern > PatternID::MAX`.
209	#[inline]
210	pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
211	MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
212	}
213
214	/// Returns the ID of the pattern that matched.
215	///
216	/// The ID of a pattern is derived from the position in which it was
217	/// originally inserted into the corresponding regex engine. The first
218	/// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
219	/// so on.
220	#[inline]
221	pub fn pattern(&self) -> PatternID {
222	self.pattern
223	}
224
225	/// The starting position of the match.
226	#[inline]
227	pub fn start(&self) -> usize {
228	self.start
229	}
230
231	/// The ending position of the match.
232	#[inline]
233	pub fn end(&self) -> usize {
234	self.end
235	}
236
237	/// Returns the match location as a range.
238	#[inline]
239	pub fn range(&self) -> core::ops::Range<usize> {
240	self.start..self.end
241	}
242
243	/// Returns true if and only if this match is empty. That is, when
244	/// `start() == end()`.
245	///
246	/// An empty match can only be returned when the empty string was among
247	/// the patterns used to build the Aho-Corasick automaton.
248	#[inline]
249	pub fn is_empty(&self) -> bool {
250	self.start == self.end
251	}
252	}
253
254	/// An error type indicating that a search stopped prematurely without finding
255	/// a match.
256	///
257	/// This error type implies that one cannot assume that no matches occur, since
258	/// the search stopped before completing.
259	///
260	/// Normally, when one searches for something, the response is either an
261	/// affirmative "it was found at this location" or a negative "not found at
262	/// all." However, in some cases, a regex engine can be configured to stop its
263	/// search before concluding whether a match exists or not. When this happens,
264	/// it may be important for the caller to know why the regex engine gave up and
265	/// where in the input it gave up at. This error type exposes the 'why' and the
266	/// 'where.'
267	///
268	/// For example, the DFAs provided by this library generally cannot correctly
269	/// implement Unicode word boundaries. Instead, they provide an option to
270	/// eagerly support them on ASCII text (since Unicode word boundaries are
271	/// equivalent to ASCII word boundaries when searching ASCII text), but will
272	/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
273	/// required to either report the failure to the caller (unergonomic) or
274	/// otherwise fall back to some other regex engine (ergonomic, but potentially
275	/// costly).
276	///
277	/// More generally, some regex engines offer the ability for callers to specify
278	/// certain bytes that will trigger the regex engine to automatically quit if
279	/// they are seen.
280	///
281	/// Still yet, there may be other reasons for a failed match. For example,
282	/// the hybrid DFA provided by this crate can be configured to give up if it
283	/// believes that it is not efficient. This in turn permits callers to choose a
284	/// different regex engine.
285	///
286	/// # Advice
287	///
288	/// While this form of error reporting adds complexity, it is generally
289	/// possible for callers to configure regex engines to never give up a search,
290	/// and thus never return an error. Indeed, the default configuration for every
291	/// regex engine in this crate is such that they will never stop searching
292	/// early. Therefore, the only way to get a match error is if the regex engine
293	/// is explicitly configured to do so. Options that enable this behavior
294	/// document the new error conditions they imply.
295	///
296	/// Regex engines for which no errors are possible for any configuration will
297	/// return the normal `Option<Match>` and not use this error type at all.
298	///
299	/// For example, regex engines in the `dfa` sub-module will only report
300	/// `MatchError::Quit` if instructed by either
301	/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
302	/// or by
303	/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
304	#[derive(Clone, Debug, Eq, Hash, PartialEq)]
305	pub enum MatchError {
306	// Note that the first version of this type was called `SearchError` and it
307	// included a third `None` variant to indicate that the search completed
308	// and no match was found. However, this was problematic for iterator
309	// APIs where the `None` sentinel for stopping iteration corresponds
310	// precisely to the "match not found" case. The fact that the `None`
311	// variant was buried inside this type was in turn quite awkward. So
312	// instead, I removed the `None` variant, renamed the type and used
313	// `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
314	// conceptually simpler `Result<Match, MatchError>`. However, we "regain"
315	// ergonomics by only putting the more complex API in the `try_` variants
316	// ("fallible") of search methods. The infallible APIs will instead just
317	// return `Option<Match>` and panic on error.
318	/// The search saw a "quit" byte at which it was instructed to stop
319	/// searching.
320	Quit {
321	/// The "quit" byte that was observed that caused the search to stop.
322	byte: u8,
323	/// The offset at which the quit byte was observed.
324	offset: usize,
325	},
326	/// The search, based on heuristics, determined that it would be better
327	/// to stop, typically to provide the caller an opportunity to use an
328	/// alternative regex engine.
329	///
330	/// Currently, the only way for this to occur is via the lazy DFA and
331	/// only when it is configured to do so (it will not return this error by
332	/// default).
333	GaveUp {
334	/// The offset at which the search stopped. This corresponds to the
335	/// position immediately following the last byte scanned.
336	offset: usize,
337	},
338	}
339
340	#[cfg(feature = "std")]
341	impl std::error::Error for MatchError {}
342
343	impl core::fmt::Display for MatchError {
344	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
345	match *self {
346	MatchError::Quit { byte: u8, offset: usize } => write!(
347	f,
348	"quit search after observing byte `\\`x{:`02`X} at offset {}",
349	byte, offset,
350	),
351	MatchError::GaveUp { offset: usize } => {
352	write!(f, "gave up searching at offset {}", offset)
353	}
354	}
355	}
356	}
357