1 | use state_id::StateID; |
---|---|

2 | |

3 | /// A trait describing the interface of a deterministic finite automaton (DFA). |

4 | /// |

5 | /// Every DFA has exactly one start state and at least one dead state (which |

6 | /// may be the same, as in the case of an empty DFA). In all cases, a state |

7 | /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` |

8 | /// always returns `true`. |

9 | /// |

10 | /// Every DFA also has zero or more match states, such that |

11 | /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to |

12 | /// a match state. |

13 | /// |

14 | /// In general, users of this trait likely will only need to use the search |

15 | /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other |

16 | /// methods are lower level and are used for walking the transitions of a DFA |

17 | /// manually. In particular, the aforementioned search routines are implemented |

18 | /// generically in terms of the lower level transition walking routines. |

19 | pub trait DFA { |

20 | /// The representation used for state identifiers in this DFA. |

21 | /// |

22 | /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. |

23 | type ID: StateID; |

24 | |

25 | /// Return the identifier of this DFA's start state. |

26 | fn start_state(&self) -> Self::ID; |

27 | |

28 | /// Returns true if and only if the given identifier corresponds to a match |

29 | /// state. |

30 | fn is_match_state(&self, id: Self::ID) -> bool; |

31 | |

32 | /// Returns true if and only if the given identifier corresponds to a dead |

33 | /// state. When a DFA enters a dead state, it is impossible to leave and |

34 | /// thus can never lead to a match. |

35 | fn is_dead_state(&self, id: Self::ID) -> bool; |

36 | |

37 | /// Returns true if and only if the given identifier corresponds to either |

38 | /// a dead state or a match state, such that one of `is_match_state(id)` |

39 | /// or `is_dead_state(id)` must return true. |

40 | /// |

41 | /// Depending on the implementation of the DFA, this routine can be used |

42 | /// to save a branch in the core matching loop. Nevertheless, |

43 | /// `is_match_state(id) || is_dead_state(id)` is always a valid |

44 | /// implementation. |

45 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool; |

46 | |

47 | /// Returns true if and only if this DFA is anchored. |

48 | /// |

49 | /// When a DFA is anchored, it is only allowed to report matches that |

50 | /// start at index `0`. |

51 | fn is_anchored(&self) -> bool; |

52 | |

53 | /// Given the current state that this DFA is in and the next input byte, |

54 | /// this method returns the identifier of the next state. The identifier |

55 | /// returned is always valid, but it may correspond to a dead state. |

56 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; |

57 | |

58 | /// Like `next_state`, but its implementation may look up the next state |

59 | /// without memory safety checks such as bounds checks. As such, callers |

60 | /// must ensure that the given identifier corresponds to a valid DFA |

61 | /// state. Implementors must, in turn, ensure that this routine is safe |

62 | /// for all valid state identifiers and for all possible `u8` values. |

63 | unsafe fn next_state_unchecked( |

64 | &self, |

65 | current: Self::ID, |

66 | input: u8, |

67 | ) -> Self::ID; |

68 | |

69 | /// Returns true if and only if the given bytes match this DFA. |

70 | /// |

71 | /// This routine may short circuit if it knows that scanning future input |

72 | /// will never lead to a different result. In particular, if a DFA enters |

73 | /// a match state or a dead state, then this routine will return `true` or |

74 | /// `false`, respectively, without inspecting any future input. |

75 | /// |

76 | /// # Example |

77 | /// |

78 | /// This example shows how to use this method with a |

79 | /// [`DenseDFA`](enum.DenseDFA.html). |

80 | /// |

81 | /// ``` |

82 | /// use regex_automata::{DFA, DenseDFA}; |

83 | /// |

84 | /// # fn example() -> Result<(), regex_automata::Error> { |

85 | /// let dfa = DenseDFA::new("foo[0-9]+bar")?; |

86 | /// assert_eq!(true, dfa.is_match(b"foo12345bar")); |

87 | /// assert_eq!(false, dfa.is_match(b"foobar")); |

88 | /// # Ok(()) }; example().unwrap() |

89 | /// ``` |

90 | #[inline] |

91 | fn is_match(&self, bytes: &[u8]) -> bool { |

92 | self.is_match_at(bytes, 0) |

93 | } |

94 | |

95 | /// Returns the first position at which a match is found. |

96 | /// |

97 | /// This routine stops scanning input in precisely the same circumstances |

98 | /// as `is_match`. The key difference is that this routine returns the |

99 | /// position at which it stopped scanning input if and only if a match |

100 | /// was found. If no match is found, then `None` is returned. |

101 | /// |

102 | /// # Example |

103 | /// |

104 | /// This example shows how to use this method with a |

105 | /// [`DenseDFA`](enum.DenseDFA.html). |

106 | /// |

107 | /// ``` |

108 | /// use regex_automata::{DFA, DenseDFA}; |

109 | /// |

110 | /// # fn example() -> Result<(), regex_automata::Error> { |

111 | /// let dfa = DenseDFA::new("foo[0-9]+")?; |

112 | /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345")); |

113 | /// |

114 | /// // Normally, the end of the leftmost first match here would be 3, |

115 | /// // but the shortest match semantics detect a match earlier. |

116 | /// let dfa = DenseDFA::new("abc|a")?; |

117 | /// assert_eq!(Some(1), dfa.shortest_match(b"abc")); |

118 | /// # Ok(()) }; example().unwrap() |

119 | /// ``` |

120 | #[inline] |

121 | fn shortest_match(&self, bytes: &[u8]) -> Option<usize> { |

122 | self.shortest_match_at(bytes, 0) |

123 | } |

124 | |

125 | /// Returns the end offset of the longest match. If no match exists, |

126 | /// then `None` is returned. |

127 | /// |

128 | /// Implementors of this trait are not required to implement any particular |

129 | /// match semantics (such as leftmost-first), which are instead manifest in |

130 | /// the DFA's topology itself. |

131 | /// |

132 | /// In particular, this method must continue searching even after it |

133 | /// enters a match state. The search should only terminate once it has |

134 | /// reached the end of the input or when it has entered a dead state. Upon |

135 | /// termination, the position of the last byte seen while still in a match |

136 | /// state is returned. |

137 | /// |

138 | /// # Example |

139 | /// |

140 | /// This example shows how to use this method with a |

141 | /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses |

142 | /// "leftmost first" match semantics. |

143 | /// |

144 | /// Leftmost first match semantics corresponds to the match with the |

145 | /// smallest starting offset, but where the end offset is determined by |

146 | /// preferring earlier branches in the original regular expression. For |

147 | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |

148 | /// will match `Samwise` in `Samwise`. |

149 | /// |

150 | /// Generally speaking, the "leftmost first" match is how most backtracking |

151 | /// regular expressions tend to work. This is in contrast to POSIX-style |

152 | /// regular expressions that yield "leftmost longest" matches. Namely, |

153 | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |

154 | /// leftmost longest semantics. |

155 | /// |

156 | /// ``` |

157 | /// use regex_automata::{DFA, DenseDFA}; |

158 | /// |

159 | /// # fn example() -> Result<(), regex_automata::Error> { |

160 | /// let dfa = DenseDFA::new("foo[0-9]+")?; |

161 | /// assert_eq!(Some(8), dfa.find(b"foo12345")); |

162 | /// |

163 | /// // Even though a match is found after reading the first byte (`a`), |

164 | /// // the leftmost first match semantics demand that we find the earliest |

165 | /// // match that prefers earlier parts of the pattern over latter parts. |

166 | /// let dfa = DenseDFA::new("abc|a")?; |

167 | /// assert_eq!(Some(3), dfa.find(b"abc")); |

168 | /// # Ok(()) }; example().unwrap() |

169 | /// ``` |

170 | #[inline] |

171 | fn find(&self, bytes: &[u8]) -> Option<usize> { |

172 | self.find_at(bytes, 0) |

173 | } |

174 | |

175 | /// Returns the start offset of the longest match in reverse, by searching |

176 | /// from the end of the input towards the start of the input. If no match |

177 | /// exists, then `None` is returned. In other words, this has the same |

178 | /// match semantics as `find`, but in reverse. |

179 | /// |

180 | /// # Example |

181 | /// |

182 | /// This example shows how to use this method with a |

183 | /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine |

184 | /// is principally useful when used in conjunction with the |

185 | /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) |

186 | /// configuration knob. In general, it's unlikely to be correct to use both |

187 | /// `find` and `rfind` with the same DFA since any particular DFA will only |

188 | /// support searching in one direction. |

189 | /// |

190 | /// ``` |

191 | /// use regex_automata::{dense, DFA}; |

192 | /// |

193 | /// # fn example() -> Result<(), regex_automata::Error> { |

194 | /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?; |

195 | /// assert_eq!(Some(0), dfa.rfind(b"foo12345")); |

196 | /// # Ok(()) }; example().unwrap() |

197 | /// ``` |

198 | #[inline] |

199 | fn rfind(&self, bytes: &[u8]) -> Option<usize> { |

200 | self.rfind_at(bytes, bytes.len()) |

201 | } |

202 | |

203 | /// Returns the same as `is_match`, but starts the search at the given |

204 | /// offset. |

205 | /// |

206 | /// The significance of the starting point is that it takes the surrounding |

207 | /// context into consideration. For example, if the DFA is anchored, then |

208 | /// a match can only occur when `start == 0`. |

209 | #[inline] |

210 | fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { |

211 | if self.is_anchored() && start > 0 { |

212 | return false; |

213 | } |

214 | |

215 | let mut state = self.start_state(); |

216 | if self.is_match_or_dead_state(state) { |

217 | return self.is_match_state(state); |

218 | } |

219 | for &b in bytes[start..].iter() { |

220 | state = unsafe { self.next_state_unchecked(state, b) }; |

221 | if self.is_match_or_dead_state(state) { |

222 | return self.is_match_state(state); |

223 | } |

224 | } |

225 | false |

226 | } |

227 | |

228 | /// Returns the same as `shortest_match`, but starts the search at the |

229 | /// given offset. |

230 | /// |

231 | /// The significance of the starting point is that it takes the surrounding |

232 | /// context into consideration. For example, if the DFA is anchored, then |

233 | /// a match can only occur when `start == 0`. |

234 | #[inline] |

235 | fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |

236 | if self.is_anchored() && start > 0 { |

237 | return None; |

238 | } |

239 | |

240 | let mut state = self.start_state(); |

241 | if self.is_match_or_dead_state(state) { |

242 | return if self.is_dead_state(state) { None } else { Some(start) }; |

243 | } |

244 | for (i, &b) in bytes[start..].iter().enumerate() { |

245 | state = unsafe { self.next_state_unchecked(state, b) }; |

246 | if self.is_match_or_dead_state(state) { |

247 | return if self.is_dead_state(state) { |

248 | None |

249 | } else { |

250 | Some(start + i + 1) |

251 | }; |

252 | } |

253 | } |

254 | None |

255 | } |

256 | |

257 | /// Returns the same as `find`, but starts the search at the given |

258 | /// offset. |

259 | /// |

260 | /// The significance of the starting point is that it takes the surrounding |

261 | /// context into consideration. For example, if the DFA is anchored, then |

262 | /// a match can only occur when `start == 0`. |

263 | #[inline] |

264 | fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |

265 | if self.is_anchored() && start > 0 { |

266 | return None; |

267 | } |

268 | |

269 | let mut state = self.start_state(); |

270 | let mut last_match = if self.is_dead_state(state) { |

271 | return None; |

272 | } else if self.is_match_state(state) { |

273 | Some(start) |

274 | } else { |

275 | None |

276 | }; |

277 | for (i, &b) in bytes[start..].iter().enumerate() { |

278 | state = unsafe { self.next_state_unchecked(state, b) }; |

279 | if self.is_match_or_dead_state(state) { |

280 | if self.is_dead_state(state) { |

281 | return last_match; |

282 | } |

283 | last_match = Some(start + i + 1); |

284 | } |

285 | } |

286 | last_match |

287 | } |

288 | |

289 | /// Returns the same as `rfind`, but starts the search at the given |

290 | /// offset. |

291 | /// |

292 | /// The significance of the starting point is that it takes the surrounding |

293 | /// context into consideration. For example, if the DFA is anchored, then |

294 | /// a match can only occur when `start == bytes.len()`. |

295 | #[inline(never)] |

296 | fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |

297 | if self.is_anchored() && start < bytes.len() { |

298 | return None; |

299 | } |

300 | |

301 | let mut state = self.start_state(); |

302 | let mut last_match = if self.is_dead_state(state) { |

303 | return None; |

304 | } else if self.is_match_state(state) { |

305 | Some(start) |

306 | } else { |

307 | None |

308 | }; |

309 | for (i, &b) in bytes[..start].iter().enumerate().rev() { |

310 | state = unsafe { self.next_state_unchecked(state, b) }; |

311 | if self.is_match_or_dead_state(state) { |

312 | if self.is_dead_state(state) { |

313 | return last_match; |

314 | } |

315 | last_match = Some(i); |

316 | } |

317 | } |

318 | last_match |

319 | } |

320 | } |

321 | |

322 | impl<'a, T: DFA> DFA for &'a T { |

323 | type ID = T::ID; |

324 | |

325 | #[inline] |

326 | fn start_state(&self) -> Self::ID { |

327 | (**self).start_state() |

328 | } |

329 | |

330 | #[inline] |

331 | fn is_match_state(&self, id: Self::ID) -> bool { |

332 | (**self).is_match_state(id) |

333 | } |

334 | |

335 | #[inline] |

336 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool { |

337 | (**self).is_match_or_dead_state(id) |

338 | } |

339 | |

340 | #[inline] |

341 | fn is_dead_state(&self, id: Self::ID) -> bool { |

342 | (**self).is_dead_state(id) |

343 | } |

344 | |

345 | #[inline] |

346 | fn is_anchored(&self) -> bool { |

347 | (**self).is_anchored() |

348 | } |

349 | |

350 | #[inline] |

351 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { |

352 | (**self).next_state(current, input) |

353 | } |

354 | |

355 | #[inline] |

356 | unsafe fn next_state_unchecked( |

357 | &self, |

358 | current: Self::ID, |

359 | input: u8, |

360 | ) -> Self::ID { |

361 | (**self).next_state_unchecked(current, input) |

362 | } |

363 | } |

364 |