1use state_id::StateID;
2
3/// A trait describing the interface of a deterministic finite automaton (DFA).
4///
5/// Every DFA has exactly one start state and at least one dead state (which
6/// may be the same, as in the case of an empty DFA). In all cases, a state
7/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
8/// always returns `true`.
9///
10/// Every DFA also has zero or more match states, such that
11/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
12/// a match state.
13///
14/// In general, users of this trait likely will only need to use the search
15/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
16/// methods are lower level and are used for walking the transitions of a DFA
17/// manually. In particular, the aforementioned search routines are implemented
18/// generically in terms of the lower level transition walking routines.
19pub trait DFA {
20 /// The representation used for state identifiers in this DFA.
21 ///
22 /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
23 type ID: StateID;
24
25 /// Return the identifier of this DFA's start state.
26 fn start_state(&self) -> Self::ID;
27
28 /// Returns true if and only if the given identifier corresponds to a match
29 /// state.
30 fn is_match_state(&self, id: Self::ID) -> bool;
31
32 /// Returns true if and only if the given identifier corresponds to a dead
33 /// state. When a DFA enters a dead state, it is impossible to leave and
34 /// thus can never lead to a match.
35 fn is_dead_state(&self, id: Self::ID) -> bool;
36
37 /// Returns true if and only if the given identifier corresponds to either
38 /// a dead state or a match state, such that one of `is_match_state(id)`
39 /// or `is_dead_state(id)` must return true.
40 ///
41 /// Depending on the implementation of the DFA, this routine can be used
42 /// to save a branch in the core matching loop. Nevertheless,
43 /// `is_match_state(id) || is_dead_state(id)` is always a valid
44 /// implementation.
45 fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
46
47 /// Returns true if and only if this DFA is anchored.
48 ///
49 /// When a DFA is anchored, it is only allowed to report matches that
50 /// start at index `0`.
51 fn is_anchored(&self) -> bool;
52
53 /// Given the current state that this DFA is in and the next input byte,
54 /// this method returns the identifier of the next state. The identifier
55 /// returned is always valid, but it may correspond to a dead state.
56 fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
57
58 /// Like `next_state`, but its implementation may look up the next state
59 /// without memory safety checks such as bounds checks. As such, callers
60 /// must ensure that the given identifier corresponds to a valid DFA
61 /// state. Implementors must, in turn, ensure that this routine is safe
62 /// for all valid state identifiers and for all possible `u8` values.
63 unsafe fn next_state_unchecked(
64 &self,
65 current: Self::ID,
66 input: u8,
67 ) -> Self::ID;
68
69 /// Returns true if and only if the given bytes match this DFA.
70 ///
71 /// This routine may short circuit if it knows that scanning future input
72 /// will never lead to a different result. In particular, if a DFA enters
73 /// a match state or a dead state, then this routine will return `true` or
74 /// `false`, respectively, without inspecting any future input.
75 ///
76 /// # Example
77 ///
78 /// This example shows how to use this method with a
79 /// [`DenseDFA`](enum.DenseDFA.html).
80 ///
81 /// ```
82 /// use regex_automata::{DFA, DenseDFA};
83 ///
84 /// # fn example() -> Result<(), regex_automata::Error> {
85 /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
86 /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
87 /// assert_eq!(false, dfa.is_match(b"foobar"));
88 /// # Ok(()) }; example().unwrap()
89 /// ```
90 #[inline]
91 fn is_match(&self, bytes: &[u8]) -> bool {
92 self.is_match_at(bytes, 0)
93 }
94
95 /// Returns the first position at which a match is found.
96 ///
97 /// This routine stops scanning input in precisely the same circumstances
98 /// as `is_match`. The key difference is that this routine returns the
99 /// position at which it stopped scanning input if and only if a match
100 /// was found. If no match is found, then `None` is returned.
101 ///
102 /// # Example
103 ///
104 /// This example shows how to use this method with a
105 /// [`DenseDFA`](enum.DenseDFA.html).
106 ///
107 /// ```
108 /// use regex_automata::{DFA, DenseDFA};
109 ///
110 /// # fn example() -> Result<(), regex_automata::Error> {
111 /// let dfa = DenseDFA::new("foo[0-9]+")?;
112 /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
113 ///
114 /// // Normally, the end of the leftmost first match here would be 3,
115 /// // but the shortest match semantics detect a match earlier.
116 /// let dfa = DenseDFA::new("abc|a")?;
117 /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
118 /// # Ok(()) }; example().unwrap()
119 /// ```
120 #[inline]
121 fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
122 self.shortest_match_at(bytes, 0)
123 }
124
125 /// Returns the end offset of the longest match. If no match exists,
126 /// then `None` is returned.
127 ///
128 /// Implementors of this trait are not required to implement any particular
129 /// match semantics (such as leftmost-first), which are instead manifest in
130 /// the DFA's topology itself.
131 ///
132 /// In particular, this method must continue searching even after it
133 /// enters a match state. The search should only terminate once it has
134 /// reached the end of the input or when it has entered a dead state. Upon
135 /// termination, the position of the last byte seen while still in a match
136 /// state is returned.
137 ///
138 /// # Example
139 ///
140 /// This example shows how to use this method with a
141 /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
142 /// "leftmost first" match semantics.
143 ///
144 /// Leftmost first match semantics corresponds to the match with the
145 /// smallest starting offset, but where the end offset is determined by
146 /// preferring earlier branches in the original regular expression. For
147 /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
148 /// will match `Samwise` in `Samwise`.
149 ///
150 /// Generally speaking, the "leftmost first" match is how most backtracking
151 /// regular expressions tend to work. This is in contrast to POSIX-style
152 /// regular expressions that yield "leftmost longest" matches. Namely,
153 /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
154 /// leftmost longest semantics.
155 ///
156 /// ```
157 /// use regex_automata::{DFA, DenseDFA};
158 ///
159 /// # fn example() -> Result<(), regex_automata::Error> {
160 /// let dfa = DenseDFA::new("foo[0-9]+")?;
161 /// assert_eq!(Some(8), dfa.find(b"foo12345"));
162 ///
163 /// // Even though a match is found after reading the first byte (`a`),
164 /// // the leftmost first match semantics demand that we find the earliest
165 /// // match that prefers earlier parts of the pattern over latter parts.
166 /// let dfa = DenseDFA::new("abc|a")?;
167 /// assert_eq!(Some(3), dfa.find(b"abc"));
168 /// # Ok(()) }; example().unwrap()
169 /// ```
170 #[inline]
171 fn find(&self, bytes: &[u8]) -> Option<usize> {
172 self.find_at(bytes, 0)
173 }
174
175 /// Returns the start offset of the longest match in reverse, by searching
176 /// from the end of the input towards the start of the input. If no match
177 /// exists, then `None` is returned. In other words, this has the same
178 /// match semantics as `find`, but in reverse.
179 ///
180 /// # Example
181 ///
182 /// This example shows how to use this method with a
183 /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
184 /// is principally useful when used in conjunction with the
185 /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
186 /// configuration knob. In general, it's unlikely to be correct to use both
187 /// `find` and `rfind` with the same DFA since any particular DFA will only
188 /// support searching in one direction.
189 ///
190 /// ```
191 /// use regex_automata::{dense, DFA};
192 ///
193 /// # fn example() -> Result<(), regex_automata::Error> {
194 /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
195 /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
196 /// # Ok(()) }; example().unwrap()
197 /// ```
198 #[inline]
199 fn rfind(&self, bytes: &[u8]) -> Option<usize> {
200 self.rfind_at(bytes, bytes.len())
201 }
202
203 /// Returns the same as `is_match`, but starts the search at the given
204 /// offset.
205 ///
206 /// The significance of the starting point is that it takes the surrounding
207 /// context into consideration. For example, if the DFA is anchored, then
208 /// a match can only occur when `start == 0`.
209 #[inline]
210 fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
211 if self.is_anchored() && start > 0 {
212 return false;
213 }
214
215 let mut state = self.start_state();
216 if self.is_match_or_dead_state(state) {
217 return self.is_match_state(state);
218 }
219 for &b in bytes[start..].iter() {
220 state = unsafe { self.next_state_unchecked(state, b) };
221 if self.is_match_or_dead_state(state) {
222 return self.is_match_state(state);
223 }
224 }
225 false
226 }
227
228 /// Returns the same as `shortest_match`, but starts the search at the
229 /// given offset.
230 ///
231 /// The significance of the starting point is that it takes the surrounding
232 /// context into consideration. For example, if the DFA is anchored, then
233 /// a match can only occur when `start == 0`.
234 #[inline]
235 fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
236 if self.is_anchored() && start > 0 {
237 return None;
238 }
239
240 let mut state = self.start_state();
241 if self.is_match_or_dead_state(state) {
242 return if self.is_dead_state(state) { None } else { Some(start) };
243 }
244 for (i, &b) in bytes[start..].iter().enumerate() {
245 state = unsafe { self.next_state_unchecked(state, b) };
246 if self.is_match_or_dead_state(state) {
247 return if self.is_dead_state(state) {
248 None
249 } else {
250 Some(start + i + 1)
251 };
252 }
253 }
254 None
255 }
256
257 /// Returns the same as `find`, but starts the search at the given
258 /// offset.
259 ///
260 /// The significance of the starting point is that it takes the surrounding
261 /// context into consideration. For example, if the DFA is anchored, then
262 /// a match can only occur when `start == 0`.
263 #[inline]
264 fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
265 if self.is_anchored() && start > 0 {
266 return None;
267 }
268
269 let mut state = self.start_state();
270 let mut last_match = if self.is_dead_state(state) {
271 return None;
272 } else if self.is_match_state(state) {
273 Some(start)
274 } else {
275 None
276 };
277 for (i, &b) in bytes[start..].iter().enumerate() {
278 state = unsafe { self.next_state_unchecked(state, b) };
279 if self.is_match_or_dead_state(state) {
280 if self.is_dead_state(state) {
281 return last_match;
282 }
283 last_match = Some(start + i + 1);
284 }
285 }
286 last_match
287 }
288
289 /// Returns the same as `rfind`, but starts the search at the given
290 /// offset.
291 ///
292 /// The significance of the starting point is that it takes the surrounding
293 /// context into consideration. For example, if the DFA is anchored, then
294 /// a match can only occur when `start == bytes.len()`.
295 #[inline(never)]
296 fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
297 if self.is_anchored() && start < bytes.len() {
298 return None;
299 }
300
301 let mut state = self.start_state();
302 let mut last_match = if self.is_dead_state(state) {
303 return None;
304 } else if self.is_match_state(state) {
305 Some(start)
306 } else {
307 None
308 };
309 for (i, &b) in bytes[..start].iter().enumerate().rev() {
310 state = unsafe { self.next_state_unchecked(state, b) };
311 if self.is_match_or_dead_state(state) {
312 if self.is_dead_state(state) {
313 return last_match;
314 }
315 last_match = Some(i);
316 }
317 }
318 last_match
319 }
320}
321
322impl<'a, T: DFA> DFA for &'a T {
323 type ID = T::ID;
324
325 #[inline]
326 fn start_state(&self) -> Self::ID {
327 (**self).start_state()
328 }
329
330 #[inline]
331 fn is_match_state(&self, id: Self::ID) -> bool {
332 (**self).is_match_state(id)
333 }
334
335 #[inline]
336 fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
337 (**self).is_match_or_dead_state(id)
338 }
339
340 #[inline]
341 fn is_dead_state(&self, id: Self::ID) -> bool {
342 (**self).is_dead_state(id)
343 }
344
345 #[inline]
346 fn is_anchored(&self) -> bool {
347 (**self).is_anchored()
348 }
349
350 #[inline]
351 fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
352 (**self).next_state(current, input)
353 }
354
355 #[inline]
356 unsafe fn next_state_unchecked(
357 &self,
358 current: Self::ID,
359 input: u8,
360 ) -> Self::ID {
361 (**self).next_state_unchecked(current, input)
362 }
363}
364