1/*!
2Provides helpers for dealing with start state configurations in DFAs.
3*/
4
5use crate::util::{
6 look::LookMatcher,
7 search::{Anchored, Input},
8 wire::{self, DeserializeError, SerializeError},
9};
10
11/// The configuration used to determine a DFA's start state for a search.
12///
13/// A DFA has a single starting state in the typical textbook description. That
14/// is, it corresponds to the set of all starting states for the NFA that built
15/// it, along with their espsilon closures. In this crate, however, DFAs have
16/// many possible start states due to a few factors:
17///
18/// * DFAs support the ability to run either anchored or unanchored searches.
19/// Each type of search needs its own start state. For example, an unanchored
20/// search requires starting at a state corresponding to a regex with a
21/// `(?s-u:.)*?` prefix, which will match through anything.
22/// * DFAs also optionally support starting an anchored search for any one
23/// specific pattern. Each such pattern requires its own start state.
24/// * If a look-behind assertion like `^` or `\b` is used in the regex, then
25/// the DFA will need to inspect a single byte immediately before the start of
26/// the search to choose the correct start state.
27///
28/// Indeed, this configuration precisely encapsulates all of the above factors.
29/// The [`Config::anchored`] method sets which kind of anchored search to
30/// perform while the [`Config::look_behind`] method provides a way to set
31/// the byte that occurs immediately before the start of the search.
32///
33/// Generally speaking, this type is only useful when you want to run searches
34/// without using an [`Input`]. In particular, an `Input` wants a haystack
35/// slice, but callers may not have a contiguous sequence of bytes as a
36/// haystack in all cases. This type provides a lower level of control such
37/// that callers can provide their own anchored configuration and look-behind
38/// byte explicitly.
39///
40/// # Example
41///
42/// This shows basic usage that permits running a search with a DFA without
43/// using the `Input` abstraction.
44///
45/// ```
46/// use regex_automata::{
47/// dfa::{Automaton, dense},
48/// util::start,
49/// Anchored,
50/// };
51///
52/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
53/// let haystack = "quartz";
54///
55/// let config = start::Config::new().anchored(Anchored::Yes);
56/// let mut state = dfa.start_state(&config)?;
57/// for &b in haystack.as_bytes().iter() {
58/// state = dfa.next_state(state, b);
59/// }
60/// state = dfa.next_eoi_state(state);
61/// assert!(dfa.is_match_state(state));
62///
63/// # Ok::<(), Box<dyn std::error::Error>>(())
64/// ```
65///
66/// This example shows how to correctly run a search that doesn't begin at
67/// the start of a haystack. Notice how we set the look-behind byte, and as
68/// a result, the `\b` assertion does not match.
69///
70/// ```
71/// use regex_automata::{
72/// dfa::{Automaton, dense},
73/// util::start,
74/// Anchored,
75/// };
76///
77/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
78/// let haystack = "quartz";
79///
80/// let config = start::Config::new()
81/// .anchored(Anchored::Yes)
82/// .look_behind(Some(b'q'));
83/// let mut state = dfa.start_state(&config)?;
84/// for &b in haystack.as_bytes().iter().skip(1) {
85/// state = dfa.next_state(state, b);
86/// }
87/// state = dfa.next_eoi_state(state);
88/// // No match!
89/// assert!(!dfa.is_match_state(state));
90///
91/// # Ok::<(), Box<dyn std::error::Error>>(())
92/// ```
93///
94/// If we had instead not set a look-behind byte, then the DFA would assume
95/// that it was starting at the beginning of the haystack, and thus `\b` should
96/// match. This in turn would result in erroneously reporting a match:
97///
98/// ```
99/// use regex_automata::{
100/// dfa::{Automaton, dense},
101/// util::start,
102/// Anchored,
103/// };
104///
105/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
106/// let haystack = "quartz";
107///
108/// // Whoops, forgot the look-behind byte...
109/// let config = start::Config::new().anchored(Anchored::Yes);
110/// let mut state = dfa.start_state(&config)?;
111/// for &b in haystack.as_bytes().iter().skip(1) {
112/// state = dfa.next_state(state, b);
113/// }
114/// state = dfa.next_eoi_state(state);
115/// // And now we get a match unexpectedly.
116/// assert!(dfa.is_match_state(state));
117///
118/// # Ok::<(), Box<dyn std::error::Error>>(())
119/// ```
120#[derive(Clone, Debug)]
121pub struct Config {
122 look_behind: Option<u8>,
123 anchored: Anchored,
124}
125
126impl Config {
127 /// Create a new default start configuration.
128 ///
129 /// The default is an unanchored search that starts at the beginning of the
130 /// haystack.
131 pub fn new() -> Config {
132 Config { anchored: Anchored::No, look_behind: None }
133 }
134
135 /// A convenience routine for building a start configuration from an
136 /// [`Input`] for a forward search.
137 ///
138 /// This automatically sets the look-behind byte to the byte immediately
139 /// preceding the start of the search. If the start of the search is at
140 /// offset `0`, then no look-behind byte is set.
141 pub fn from_input_forward(input: &Input<'_>) -> Config {
142 let look_behind = input
143 .start()
144 .checked_sub(1)
145 .and_then(|i| input.haystack().get(i).copied());
146 Config { look_behind, anchored: input.get_anchored() }
147 }
148
149 /// A convenience routine for building a start configuration from an
150 /// [`Input`] for a reverse search.
151 ///
152 /// This automatically sets the look-behind byte to the byte immediately
153 /// following the end of the search. If the end of the search is at
154 /// offset `haystack.len()`, then no look-behind byte is set.
155 pub fn from_input_reverse(input: &Input<'_>) -> Config {
156 let look_behind = input.haystack().get(input.end()).copied();
157 Config { look_behind, anchored: input.get_anchored() }
158 }
159
160 /// Set the look-behind byte at the start of a search.
161 ///
162 /// Unless the search is intended to logically start at the beginning of a
163 /// haystack, this should _always_ be set to the byte immediately preceding
164 /// the start of the search. If no look-behind byte is set, then the start
165 /// configuration will assume it is at the beginning of the haystack. For
166 /// example, the anchor `^` will match.
167 ///
168 /// The default is that no look-behind byte is set.
169 pub fn look_behind(mut self, byte: Option<u8>) -> Config {
170 self.look_behind = byte;
171 self
172 }
173
174 /// Set the anchored mode of a search.
175 ///
176 /// The default is an unanchored search.
177 pub fn anchored(mut self, mode: Anchored) -> Config {
178 self.anchored = mode;
179 self
180 }
181
182 /// Return the look-behind byte in this configuration, if one exists.
183 pub fn get_look_behind(&self) -> Option<u8> {
184 self.look_behind
185 }
186
187 /// Return the anchored mode in this configuration.
188 pub fn get_anchored(&self) -> Anchored {
189 self.anchored
190 }
191}
192
193/// A map from every possible byte value to its corresponding starting
194/// configuration.
195///
196/// This map is used in order to lookup the start configuration for a particular
197/// position in a haystack. This start configuration is then used in
198/// combination with things like the anchored mode and pattern ID to fully
199/// determine the start state.
200///
201/// Generally speaking, this map is only used for fully compiled DFAs and lazy
202/// DFAs. For NFAs (including the one-pass DFA), the start state is generally
203/// selected by virtue of traversing the NFA state graph. DFAs do the same
204/// thing, but at build time and not search time. (Well, technically the lazy
205/// DFA does it at search time, but it does enough work to cache the full
206/// result of the epsilon closure that the NFA engines tend to need to do.)
207#[derive(Clone)]
208pub(crate) struct StartByteMap {
209 map: [Start; 256],
210}
211
212impl StartByteMap {
213 /// Create a new map from byte values to their corresponding starting
214 /// configurations. The map is determined, in part, by how look-around
215 /// assertions are matched via the matcher given.
216 pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap {
217 let mut map = [Start::NonWordByte; 256];
218 map[usize::from(b'\n')] = Start::LineLF;
219 map[usize::from(b'\r')] = Start::LineCR;
220 map[usize::from(b'_')] = Start::WordByte;
221
222 let mut byte = b'0';
223 while byte <= b'9' {
224 map[usize::from(byte)] = Start::WordByte;
225 byte += 1;
226 }
227 byte = b'A';
228 while byte <= b'Z' {
229 map[usize::from(byte)] = Start::WordByte;
230 byte += 1;
231 }
232 byte = b'a';
233 while byte <= b'z' {
234 map[usize::from(byte)] = Start::WordByte;
235 byte += 1;
236 }
237
238 let lineterm = lookm.get_line_terminator();
239 // If our line terminator is normal, then it is already handled by
240 // the LineLF and LineCR configurations. But if it's weird, then we
241 // overwrite whatever was there before for that terminator with a
242 // special configuration. The trick here is that if the terminator
243 // is, say, a word byte like `a`, then callers seeing this start
244 // configuration need to account for that and build their DFA state as
245 // if it *also* came from a word byte.
246 if lineterm != b'\r' && lineterm != b'\n' {
247 map[usize::from(lineterm)] = Start::CustomLineTerminator;
248 }
249 StartByteMap { map }
250 }
251
252 /// Return the starting configuration for the given look-behind byte.
253 ///
254 /// If no look-behind exists, callers should use `Start::Text`.
255 #[cfg_attr(feature = "perf-inline", inline(always))]
256 pub(crate) fn get(&self, byte: u8) -> Start {
257 self.map[usize::from(byte)]
258 }
259
260 /// Deserializes a byte class map from the given slice. If the slice is of
261 /// insufficient length or otherwise contains an impossible mapping, then
262 /// an error is returned. Upon success, the number of bytes read along with
263 /// the map are returned. The number of bytes read is always a multiple of
264 /// 8.
265 pub(crate) fn from_bytes(
266 slice: &[u8],
267 ) -> Result<(StartByteMap, usize), DeserializeError> {
268 wire::check_slice_len(slice, 256, "start byte map")?;
269 let mut map = [Start::NonWordByte; 256];
270 for (i, &repr) in slice[..256].iter().enumerate() {
271 map[i] = match Start::from_usize(usize::from(repr)) {
272 Some(start) => start,
273 None => {
274 return Err(DeserializeError::generic(
275 "found invalid starting configuration",
276 ))
277 }
278 };
279 }
280 Ok((StartByteMap { map }, 256))
281 }
282
283 /// Writes this map to the given byte buffer. if the given buffer is too
284 /// small, then an error is returned. Upon success, the total number of
285 /// bytes written is returned. The number of bytes written is guaranteed to
286 /// be a multiple of 8.
287 pub(crate) fn write_to(
288 &self,
289 dst: &mut [u8],
290 ) -> Result<usize, SerializeError> {
291 let nwrite = self.write_to_len();
292 if dst.len() < nwrite {
293 return Err(SerializeError::buffer_too_small("start byte map"));
294 }
295 for (i, &start) in self.map.iter().enumerate() {
296 dst[i] = start.as_u8();
297 }
298 Ok(nwrite)
299 }
300
301 /// Returns the total number of bytes written by `write_to`.
302 pub(crate) fn write_to_len(&self) -> usize {
303 256
304 }
305}
306
307impl core::fmt::Debug for StartByteMap {
308 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
309 use crate::util::escape::DebugByte;
310
311 write!(f, "StartByteMap{{")?;
312 for byte in 0..=255 {
313 if byte > 0 {
314 write!(f, ", ")?;
315 }
316 let start = self.map[usize::from(byte)];
317 write!(f, "{:?} => {:?}", DebugByte(byte), start)?;
318 }
319 write!(f, "}}")?;
320 Ok(())
321 }
322}
323
324/// Represents the six possible starting configurations of a DFA search.
325///
326/// The starting configuration is determined by inspecting the the beginning
327/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID
328/// (if specified) and the type of search (anchored or not) is what selects the
329/// start state to use in a DFA.
330///
331/// As one example, if a DFA only supports unanchored searches and does not
332/// support anchored searches for each pattern, then it will have at most 6
333/// distinct start states. (Some start states may be reused if determinization
334/// can determine that they will be equivalent.) If the DFA supports both
335/// anchored and unanchored searches, then it will have a maximum of 12
336/// distinct start states. Finally, if the DFA also supports anchored searches
337/// for each pattern, then it can have up to `12 + (N * 6)` start states, where
338/// `N` is the number of patterns.
339///
340/// Handling each of these starting configurations in the context of DFA
341/// determinization can be *quite* tricky and subtle. But the code is small
342/// and can be found at `crate::util::determinize::set_lookbehind_from_start`.
343#[derive(Clone, Copy, Debug, Eq, PartialEq)]
344pub(crate) enum Start {
345 /// This occurs when the starting position is not any of the ones below.
346 NonWordByte = 0,
347 /// This occurs when the byte immediately preceding the start of the search
348 /// is an ASCII word byte.
349 WordByte = 1,
350 /// This occurs when the starting position of the search corresponds to the
351 /// beginning of the haystack.
352 Text = 2,
353 /// This occurs when the byte immediately preceding the start of the search
354 /// is a line terminator. Specifically, `\n`.
355 LineLF = 3,
356 /// This occurs when the byte immediately preceding the start of the search
357 /// is a line terminator. Specifically, `\r`.
358 LineCR = 4,
359 /// This occurs when a custom line terminator has been set via a
360 /// `LookMatcher`, and when that line terminator is neither a `\r` or a
361 /// `\n`.
362 ///
363 /// If the custom line terminator is a word byte, then this start
364 /// configuration is still selected. DFAs that implement word boundary
365 /// assertions will likely need to check whether the custom line terminator
366 /// is a word byte, in which case, it should behave as if the byte
367 /// satisfies `\b` in addition to multi-line anchors.
368 CustomLineTerminator = 5,
369}
370
371impl Start {
372 /// Return the starting state corresponding to the given integer. If no
373 /// starting state exists for the given integer, then None is returned.
374 pub(crate) fn from_usize(n: usize) -> Option<Start> {
375 match n {
376 0 => Some(Start::NonWordByte),
377 1 => Some(Start::WordByte),
378 2 => Some(Start::Text),
379 3 => Some(Start::LineLF),
380 4 => Some(Start::LineCR),
381 5 => Some(Start::CustomLineTerminator),
382 _ => None,
383 }
384 }
385
386 /// Returns the total number of starting state configurations.
387 pub(crate) fn len() -> usize {
388 6
389 }
390
391 /// Return this starting configuration as `u8` integer. It is guaranteed to
392 /// be less than `Start::len()`.
393 #[cfg_attr(feature = "perf-inline", inline(always))]
394 pub(crate) fn as_u8(&self) -> u8 {
395 // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
396 // actual int.
397 *self as u8
398 }
399
400 /// Return this starting configuration as a `usize` integer. It is
401 /// guaranteed to be less than `Start::len()`.
402 #[cfg_attr(feature = "perf-inline", inline(always))]
403 pub(crate) fn as_usize(&self) -> usize {
404 usize::from(self.as_u8())
405 }
406}
407
408#[cfg(test)]
409mod tests {
410 use super::*;
411
412 #[test]
413 fn start_fwd_done_range() {
414 let smap = StartByteMap::new(&LookMatcher::default());
415 let input = Input::new("").range(1..0);
416 let config = Config::from_input_forward(&input);
417 let start =
418 config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
419 assert_eq!(Start::Text, start);
420 }
421
422 #[test]
423 fn start_rev_done_range() {
424 let smap = StartByteMap::new(&LookMatcher::default());
425 let input = Input::new("").range(1..0);
426 let config = Config::from_input_reverse(&input);
427 let start =
428 config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
429 assert_eq!(Start::Text, start);
430 }
431
432 #[test]
433 fn start_fwd() {
434 let f = |haystack, start, end| {
435 let smap = StartByteMap::new(&LookMatcher::default());
436 let input = Input::new(haystack).range(start..end);
437 let config = Config::from_input_forward(&input);
438 let start =
439 config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
440 start
441 };
442
443 assert_eq!(Start::Text, f("", 0, 0));
444 assert_eq!(Start::Text, f("abc", 0, 3));
445 assert_eq!(Start::Text, f("\nabc", 0, 3));
446
447 assert_eq!(Start::LineLF, f("\nabc", 1, 3));
448
449 assert_eq!(Start::LineCR, f("\rabc", 1, 3));
450
451 assert_eq!(Start::WordByte, f("abc", 1, 3));
452
453 assert_eq!(Start::NonWordByte, f(" abc", 1, 3));
454 }
455
456 #[test]
457 fn start_rev() {
458 let f = |haystack, start, end| {
459 let smap = StartByteMap::new(&LookMatcher::default());
460 let input = Input::new(haystack).range(start..end);
461 let config = Config::from_input_reverse(&input);
462 let start =
463 config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
464 start
465 };
466
467 assert_eq!(Start::Text, f("", 0, 0));
468 assert_eq!(Start::Text, f("abc", 0, 3));
469 assert_eq!(Start::Text, f("abc\n", 0, 4));
470
471 assert_eq!(Start::LineLF, f("abc\nz", 0, 3));
472
473 assert_eq!(Start::LineCR, f("abc\rz", 0, 3));
474
475 assert_eq!(Start::WordByte, f("abc", 0, 2));
476
477 assert_eq!(Start::NonWordByte, f("abc ", 0, 3));
478 }
479}
480