1 | use state_id::StateID; |
2 | |
3 | /// A trait describing the interface of a deterministic finite automaton (DFA). |
4 | /// |
5 | /// Every DFA has exactly one start state and at least one dead state (which |
6 | /// may be the same, as in the case of an empty DFA). In all cases, a state |
7 | /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` |
8 | /// always returns `true`. |
9 | /// |
10 | /// Every DFA also has zero or more match states, such that |
11 | /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to |
12 | /// a match state. |
13 | /// |
14 | /// In general, users of this trait likely will only need to use the search |
15 | /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other |
16 | /// methods are lower level and are used for walking the transitions of a DFA |
17 | /// manually. In particular, the aforementioned search routines are implemented |
18 | /// generically in terms of the lower level transition walking routines. |
19 | pub trait DFA { |
20 | /// The representation used for state identifiers in this DFA. |
21 | /// |
22 | /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. |
23 | type ID: StateID; |
24 | |
25 | /// Return the identifier of this DFA's start state. |
26 | fn start_state(&self) -> Self::ID; |
27 | |
28 | /// Returns true if and only if the given identifier corresponds to a match |
29 | /// state. |
30 | fn is_match_state(&self, id: Self::ID) -> bool; |
31 | |
32 | /// Returns true if and only if the given identifier corresponds to a dead |
33 | /// state. When a DFA enters a dead state, it is impossible to leave and |
34 | /// thus can never lead to a match. |
35 | fn is_dead_state(&self, id: Self::ID) -> bool; |
36 | |
37 | /// Returns true if and only if the given identifier corresponds to either |
38 | /// a dead state or a match state, such that one of `is_match_state(id)` |
39 | /// or `is_dead_state(id)` must return true. |
40 | /// |
41 | /// Depending on the implementation of the DFA, this routine can be used |
42 | /// to save a branch in the core matching loop. Nevertheless, |
43 | /// `is_match_state(id) || is_dead_state(id)` is always a valid |
44 | /// implementation. |
45 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool; |
46 | |
47 | /// Returns true if and only if this DFA is anchored. |
48 | /// |
49 | /// When a DFA is anchored, it is only allowed to report matches that |
50 | /// start at index `0`. |
51 | fn is_anchored(&self) -> bool; |
52 | |
53 | /// Given the current state that this DFA is in and the next input byte, |
54 | /// this method returns the identifier of the next state. The identifier |
55 | /// returned is always valid, but it may correspond to a dead state. |
56 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; |
57 | |
58 | /// Like `next_state`, but its implementation may look up the next state |
59 | /// without memory safety checks such as bounds checks. As such, callers |
60 | /// must ensure that the given identifier corresponds to a valid DFA |
61 | /// state. Implementors must, in turn, ensure that this routine is safe |
62 | /// for all valid state identifiers and for all possible `u8` values. |
63 | unsafe fn next_state_unchecked( |
64 | &self, |
65 | current: Self::ID, |
66 | input: u8, |
67 | ) -> Self::ID; |
68 | |
69 | /// Returns true if and only if the given bytes match this DFA. |
70 | /// |
71 | /// This routine may short circuit if it knows that scanning future input |
72 | /// will never lead to a different result. In particular, if a DFA enters |
73 | /// a match state or a dead state, then this routine will return `true` or |
74 | /// `false`, respectively, without inspecting any future input. |
75 | /// |
76 | /// # Example |
77 | /// |
78 | /// This example shows how to use this method with a |
79 | /// [`DenseDFA`](enum.DenseDFA.html). |
80 | /// |
81 | /// ``` |
82 | /// use regex_automata::{DFA, DenseDFA}; |
83 | /// |
84 | /// # fn example() -> Result<(), regex_automata::Error> { |
85 | /// let dfa = DenseDFA::new("foo[0-9]+bar" )?; |
86 | /// assert_eq!(true, dfa.is_match(b"foo12345bar" )); |
87 | /// assert_eq!(false, dfa.is_match(b"foobar" )); |
88 | /// # Ok(()) }; example().unwrap() |
89 | /// ``` |
90 | #[inline ] |
91 | fn is_match(&self, bytes: &[u8]) -> bool { |
92 | self.is_match_at(bytes, 0) |
93 | } |
94 | |
95 | /// Returns the first position at which a match is found. |
96 | /// |
97 | /// This routine stops scanning input in precisely the same circumstances |
98 | /// as `is_match`. The key difference is that this routine returns the |
99 | /// position at which it stopped scanning input if and only if a match |
100 | /// was found. If no match is found, then `None` is returned. |
101 | /// |
102 | /// # Example |
103 | /// |
104 | /// This example shows how to use this method with a |
105 | /// [`DenseDFA`](enum.DenseDFA.html). |
106 | /// |
107 | /// ``` |
108 | /// use regex_automata::{DFA, DenseDFA}; |
109 | /// |
110 | /// # fn example() -> Result<(), regex_automata::Error> { |
111 | /// let dfa = DenseDFA::new("foo[0-9]+" )?; |
112 | /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345" )); |
113 | /// |
114 | /// // Normally, the end of the leftmost first match here would be 3, |
115 | /// // but the shortest match semantics detect a match earlier. |
116 | /// let dfa = DenseDFA::new("abc|a" )?; |
117 | /// assert_eq!(Some(1), dfa.shortest_match(b"abc" )); |
118 | /// # Ok(()) }; example().unwrap() |
119 | /// ``` |
120 | #[inline ] |
121 | fn shortest_match(&self, bytes: &[u8]) -> Option<usize> { |
122 | self.shortest_match_at(bytes, 0) |
123 | } |
124 | |
125 | /// Returns the end offset of the longest match. If no match exists, |
126 | /// then `None` is returned. |
127 | /// |
128 | /// Implementors of this trait are not required to implement any particular |
129 | /// match semantics (such as leftmost-first), which are instead manifest in |
130 | /// the DFA's topology itself. |
131 | /// |
132 | /// In particular, this method must continue searching even after it |
133 | /// enters a match state. The search should only terminate once it has |
134 | /// reached the end of the input or when it has entered a dead state. Upon |
135 | /// termination, the position of the last byte seen while still in a match |
136 | /// state is returned. |
137 | /// |
138 | /// # Example |
139 | /// |
140 | /// This example shows how to use this method with a |
141 | /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses |
142 | /// "leftmost first" match semantics. |
143 | /// |
144 | /// Leftmost first match semantics corresponds to the match with the |
145 | /// smallest starting offset, but where the end offset is determined by |
146 | /// preferring earlier branches in the original regular expression. For |
147 | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
148 | /// will match `Samwise` in `Samwise`. |
149 | /// |
150 | /// Generally speaking, the "leftmost first" match is how most backtracking |
151 | /// regular expressions tend to work. This is in contrast to POSIX-style |
152 | /// regular expressions that yield "leftmost longest" matches. Namely, |
153 | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
154 | /// leftmost longest semantics. |
155 | /// |
156 | /// ``` |
157 | /// use regex_automata::{DFA, DenseDFA}; |
158 | /// |
159 | /// # fn example() -> Result<(), regex_automata::Error> { |
160 | /// let dfa = DenseDFA::new("foo[0-9]+" )?; |
161 | /// assert_eq!(Some(8), dfa.find(b"foo12345" )); |
162 | /// |
163 | /// // Even though a match is found after reading the first byte (`a`), |
164 | /// // the leftmost first match semantics demand that we find the earliest |
165 | /// // match that prefers earlier parts of the pattern over latter parts. |
166 | /// let dfa = DenseDFA::new("abc|a" )?; |
167 | /// assert_eq!(Some(3), dfa.find(b"abc" )); |
168 | /// # Ok(()) }; example().unwrap() |
169 | /// ``` |
170 | #[inline ] |
171 | fn find(&self, bytes: &[u8]) -> Option<usize> { |
172 | self.find_at(bytes, 0) |
173 | } |
174 | |
175 | /// Returns the start offset of the longest match in reverse, by searching |
176 | /// from the end of the input towards the start of the input. If no match |
177 | /// exists, then `None` is returned. In other words, this has the same |
178 | /// match semantics as `find`, but in reverse. |
179 | /// |
180 | /// # Example |
181 | /// |
182 | /// This example shows how to use this method with a |
183 | /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine |
184 | /// is principally useful when used in conjunction with the |
185 | /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) |
186 | /// configuration knob. In general, it's unlikely to be correct to use both |
187 | /// `find` and `rfind` with the same DFA since any particular DFA will only |
188 | /// support searching in one direction. |
189 | /// |
190 | /// ``` |
191 | /// use regex_automata::{dense, DFA}; |
192 | /// |
193 | /// # fn example() -> Result<(), regex_automata::Error> { |
194 | /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+" )?; |
195 | /// assert_eq!(Some(0), dfa.rfind(b"foo12345" )); |
196 | /// # Ok(()) }; example().unwrap() |
197 | /// ``` |
198 | #[inline ] |
199 | fn rfind(&self, bytes: &[u8]) -> Option<usize> { |
200 | self.rfind_at(bytes, bytes.len()) |
201 | } |
202 | |
203 | /// Returns the same as `is_match`, but starts the search at the given |
204 | /// offset. |
205 | /// |
206 | /// The significance of the starting point is that it takes the surrounding |
207 | /// context into consideration. For example, if the DFA is anchored, then |
208 | /// a match can only occur when `start == 0`. |
209 | #[inline ] |
210 | fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { |
211 | if self.is_anchored() && start > 0 { |
212 | return false; |
213 | } |
214 | |
215 | let mut state = self.start_state(); |
216 | if self.is_match_or_dead_state(state) { |
217 | return self.is_match_state(state); |
218 | } |
219 | for &b in bytes[start..].iter() { |
220 | state = unsafe { self.next_state_unchecked(state, b) }; |
221 | if self.is_match_or_dead_state(state) { |
222 | return self.is_match_state(state); |
223 | } |
224 | } |
225 | false |
226 | } |
227 | |
228 | /// Returns the same as `shortest_match`, but starts the search at the |
229 | /// given offset. |
230 | /// |
231 | /// The significance of the starting point is that it takes the surrounding |
232 | /// context into consideration. For example, if the DFA is anchored, then |
233 | /// a match can only occur when `start == 0`. |
234 | #[inline ] |
235 | fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
236 | if self.is_anchored() && start > 0 { |
237 | return None; |
238 | } |
239 | |
240 | let mut state = self.start_state(); |
241 | if self.is_match_or_dead_state(state) { |
242 | return if self.is_dead_state(state) { None } else { Some(start) }; |
243 | } |
244 | for (i, &b) in bytes[start..].iter().enumerate() { |
245 | state = unsafe { self.next_state_unchecked(state, b) }; |
246 | if self.is_match_or_dead_state(state) { |
247 | return if self.is_dead_state(state) { |
248 | None |
249 | } else { |
250 | Some(start + i + 1) |
251 | }; |
252 | } |
253 | } |
254 | None |
255 | } |
256 | |
257 | /// Returns the same as `find`, but starts the search at the given |
258 | /// offset. |
259 | /// |
260 | /// The significance of the starting point is that it takes the surrounding |
261 | /// context into consideration. For example, if the DFA is anchored, then |
262 | /// a match can only occur when `start == 0`. |
263 | #[inline ] |
264 | fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
265 | if self.is_anchored() && start > 0 { |
266 | return None; |
267 | } |
268 | |
269 | let mut state = self.start_state(); |
270 | let mut last_match = if self.is_dead_state(state) { |
271 | return None; |
272 | } else if self.is_match_state(state) { |
273 | Some(start) |
274 | } else { |
275 | None |
276 | }; |
277 | for (i, &b) in bytes[start..].iter().enumerate() { |
278 | state = unsafe { self.next_state_unchecked(state, b) }; |
279 | if self.is_match_or_dead_state(state) { |
280 | if self.is_dead_state(state) { |
281 | return last_match; |
282 | } |
283 | last_match = Some(start + i + 1); |
284 | } |
285 | } |
286 | last_match |
287 | } |
288 | |
289 | /// Returns the same as `rfind`, but starts the search at the given |
290 | /// offset. |
291 | /// |
292 | /// The significance of the starting point is that it takes the surrounding |
293 | /// context into consideration. For example, if the DFA is anchored, then |
294 | /// a match can only occur when `start == bytes.len()`. |
295 | #[inline (never)] |
296 | fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
297 | if self.is_anchored() && start < bytes.len() { |
298 | return None; |
299 | } |
300 | |
301 | let mut state = self.start_state(); |
302 | let mut last_match = if self.is_dead_state(state) { |
303 | return None; |
304 | } else if self.is_match_state(state) { |
305 | Some(start) |
306 | } else { |
307 | None |
308 | }; |
309 | for (i, &b) in bytes[..start].iter().enumerate().rev() { |
310 | state = unsafe { self.next_state_unchecked(state, b) }; |
311 | if self.is_match_or_dead_state(state) { |
312 | if self.is_dead_state(state) { |
313 | return last_match; |
314 | } |
315 | last_match = Some(i); |
316 | } |
317 | } |
318 | last_match |
319 | } |
320 | } |
321 | |
322 | impl<'a, T: DFA> DFA for &'a T { |
323 | type ID = T::ID; |
324 | |
325 | #[inline ] |
326 | fn start_state(&self) -> Self::ID { |
327 | (**self).start_state() |
328 | } |
329 | |
330 | #[inline ] |
331 | fn is_match_state(&self, id: Self::ID) -> bool { |
332 | (**self).is_match_state(id) |
333 | } |
334 | |
335 | #[inline ] |
336 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool { |
337 | (**self).is_match_or_dead_state(id) |
338 | } |
339 | |
340 | #[inline ] |
341 | fn is_dead_state(&self, id: Self::ID) -> bool { |
342 | (**self).is_dead_state(id) |
343 | } |
344 | |
345 | #[inline ] |
346 | fn is_anchored(&self) -> bool { |
347 | (**self).is_anchored() |
348 | } |
349 | |
350 | #[inline ] |
351 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { |
352 | (**self).next_state(current, input) |
353 | } |
354 | |
355 | #[inline ] |
356 | unsafe fn next_state_unchecked( |
357 | &self, |
358 | current: Self::ID, |
359 | input: u8, |
360 | ) -> Self::ID { |
361 | (**self).next_state_unchecked(current, input) |
362 | } |
363 | } |
364 | |