1/// A state identifier specifically tailored for lazy DFAs.
2///
3/// A lazy state ID logically represents a pointer to a DFA state. In practice,
4/// by limiting the number of DFA states it can address, it reserves some
5/// bits of its representation to encode some additional information. That
6/// additional information is called a "tag." That tag is used to record
7/// whether the state it points to is an unknown, dead, quit, start or match
8/// state.
9///
10/// When implementing a low level search routine with a lazy DFA, it is
11/// necessary to query the type of the current state to know what to do:
12///
13/// * **Unknown** - The state has not yet been computed. The
14/// parameters used to get this state ID must be re-passed to
15/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
16/// return an unknown state ID.
17/// * **Dead** - A dead state only has transitions to itself. It indicates that
18/// the search cannot do anything else and should stop with whatever result it
19/// has.
20/// * **Quit** - A quit state indicates that the automaton could not answer
21/// whether a match exists or not. Correct search implementations must return a
22/// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit
23/// state.
24/// * **Start** - A start state is a state in which a search can begin.
25/// Lazy DFAs usually have more than one start state. Branching on
26/// this isn't required for correctness, but a common optimization is
27/// to run a prefilter when a search enters a start state. Note that
28/// start states are *not* tagged automatically, and one must enable the
29/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states)
30/// setting for start states to be tagged. The reason for this is
31/// that a DFA search loop is usually written to execute a prefilter once it
32/// enters a start state. But if there is no prefilter, this handling can be
33/// quite diastrous as the DFA may ping-pong between the special handling code
34/// and a possible optimized hot path for handling untagged states. When start
35/// states aren't specialized, then they are untagged and remain in the hot
36/// path.
37/// * **Match** - A match state indicates that a match has been found.
38/// Depending on the semantics of your search implementation, it may either
39/// continue until the end of the haystack or a dead state, or it might quit
40/// and return the match immediately.
41///
42/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
43/// can be used to determine if a tag exists at all. This is useful to avoid
44/// branching on all of the above types for every byte searched.
45///
46/// # Example
47///
48/// This example shows how `LazyStateID` can be used to implement a correct
49/// search routine with minimal branching. In particular, this search routine
50/// implements "leftmost" matching, which means that it doesn't immediately
51/// stop once a match is found. Instead, it continues until it reaches a dead
52/// state.
53///
54/// Notice also how a correct search implementation deals with
55/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
56/// the lazy DFA routines. When a `CacheError` occurs, it returns
57/// [`MatchError::gave_up`](crate::MatchError::gave_up).
58///
59/// ```
60/// use regex_automata::{
61/// hybrid::dfa::{Cache, DFA},
62/// HalfMatch, MatchError, Input,
63/// };
64///
65/// fn find_leftmost_first(
66/// dfa: &DFA,
67/// cache: &mut Cache,
68/// haystack: &[u8],
69/// ) -> Result<Option<HalfMatch>, MatchError> {
70/// // The start state is determined by inspecting the position and the
71/// // initial bytes of the haystack. Note that start states can never
72/// // be match states (since DFAs in this crate delay matches by 1
73/// // byte), so we don't need to check if the start state is a match.
74/// let mut sid = dfa.start_state_forward(
75/// cache,
76/// &Input::new(haystack),
77/// )?;
78/// let mut last_match = None;
79/// // Walk all the bytes in the haystack. We can quit early if we see
80/// // a dead or a quit state. The former means the automaton will
81/// // never transition to any other state. The latter means that the
82/// // automaton entered a condition in which its search failed.
83/// for (i, &b) in haystack.iter().enumerate() {
84/// sid = dfa
85/// .next_state(cache, sid, b)
86/// .map_err(|_| MatchError::gave_up(i))?;
87/// if sid.is_tagged() {
88/// if sid.is_match() {
89/// last_match = Some(HalfMatch::new(
90/// dfa.match_pattern(cache, sid, 0),
91/// i,
92/// ));
93/// } else if sid.is_dead() {
94/// return Ok(last_match);
95/// } else if sid.is_quit() {
96/// // It is possible to enter into a quit state after
97/// // observing a match has occurred. In that case, we
98/// // should return the match instead of an error.
99/// if last_match.is_some() {
100/// return Ok(last_match);
101/// }
102/// return Err(MatchError::quit(b, i));
103/// }
104/// // Implementors may also want to check for start states and
105/// // handle them differently for performance reasons. But it is
106/// // not necessary for correctness. Note that in order to check
107/// // for start states, you'll need to enable the
108/// // 'specialize_start_states' config knob, otherwise start
109/// // states will not be tagged.
110/// }
111/// }
112/// // Matches are always delayed by 1 byte, so we must explicitly walk
113/// // the special "EOI" transition at the end of the search.
114/// sid = dfa
115/// .next_eoi_state(cache, sid)
116/// .map_err(|_| MatchError::gave_up(haystack.len()))?;
117/// if sid.is_match() {
118/// last_match = Some(HalfMatch::new(
119/// dfa.match_pattern(cache, sid, 0),
120/// haystack.len(),
121/// ));
122/// }
123/// Ok(last_match)
124/// }
125///
126/// // We use a greedy '+' operator to show how the search doesn't just stop
127/// // once a match is detected. It continues extending the match. Using
128/// // '[a-z]+?' would also work as expected and stop the search early.
129/// // Greediness is built into the automaton.
130/// let dfa = DFA::new(r"[a-z]+")?;
131/// let mut cache = dfa.create_cache();
132/// let haystack = "123 foobar 4567".as_bytes();
133/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
134/// assert_eq!(mat.pattern().as_usize(), 0);
135/// assert_eq!(mat.offset(), 10);
136///
137/// // Here's another example that tests our handling of the special
138/// // EOI transition. This will fail to find a match if we don't call
139/// // 'next_eoi_state' at the end of the search since the match isn't found
140/// // until the final byte in the haystack.
141/// let dfa = DFA::new(r"[0-9]{4}")?;
142/// let mut cache = dfa.create_cache();
143/// let haystack = "123 foobar 4567".as_bytes();
144/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
145/// assert_eq!(mat.pattern().as_usize(), 0);
146/// assert_eq!(mat.offset(), 15);
147///
148/// // And note that our search implementation above automatically works
149/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
150/// // the appropriate pattern ID for us.
151/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
152/// let mut cache = dfa.create_cache();
153/// let haystack = "123 foobar 4567".as_bytes();
154/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
155/// assert_eq!(mat.pattern().as_usize(), 1);
156/// assert_eq!(mat.offset(), 3);
157/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
158/// assert_eq!(mat.pattern().as_usize(), 0);
159/// assert_eq!(mat.offset(), 7);
160/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
161/// assert_eq!(mat.pattern().as_usize(), 1);
162/// assert_eq!(mat.offset(), 5);
163///
164/// # Ok::<(), Box<dyn std::error::Error>>(())
165/// ```
166#[derive(
167 Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
168)]
169pub struct LazyStateID(u32);
170
171impl LazyStateID {
172 #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
173 const MAX_BIT: usize = 31;
174
175 #[cfg(target_pointer_width = "16")]
176 const MAX_BIT: usize = 15;
177
178 const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
179 const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
180 const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
181 const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
182 const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
183 const MAX: usize = LazyStateID::MASK_MATCH - 1;
184
185 /// Create a new lazy state ID.
186 ///
187 /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
188 /// an error.
189 #[inline]
190 pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
191 if id > LazyStateID::MAX {
192 let attempted = u64::try_from(id).unwrap();
193 return Err(LazyStateIDError { attempted });
194 }
195 Ok(LazyStateID::new_unchecked(id))
196 }
197
198 /// Create a new lazy state ID without checking whether the given value
199 /// exceeds [`LazyStateID::MAX`].
200 ///
201 /// While this is unchecked, providing an incorrect value must never
202 /// sacrifice memory safety.
203 #[inline]
204 const fn new_unchecked(id: usize) -> LazyStateID {
205 // FIXME: Use as_u32() once const functions in traits are stable.
206 LazyStateID(id as u32)
207 }
208
209 /// Return this lazy state ID as an untagged `usize`.
210 ///
211 /// If this lazy state ID is tagged, then the usize returned is the state
212 /// ID without the tag. If the ID was not tagged, then the usize returned
213 /// is equivalent to the state ID.
214 #[inline]
215 pub(crate) fn as_usize_untagged(&self) -> usize {
216 self.as_usize_unchecked() & LazyStateID::MAX
217 }
218
219 /// Return this lazy state ID as its raw internal `usize` value, which may
220 /// be tagged (and thus greater than LazyStateID::MAX).
221 #[inline]
222 pub(crate) const fn as_usize_unchecked(&self) -> usize {
223 // FIXME: Use as_usize() once const functions in traits are stable.
224 self.0 as usize
225 }
226
227 #[inline]
228 pub(crate) const fn to_unknown(&self) -> LazyStateID {
229 LazyStateID::new_unchecked(
230 self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
231 )
232 }
233
234 #[inline]
235 pub(crate) const fn to_dead(&self) -> LazyStateID {
236 LazyStateID::new_unchecked(
237 self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
238 )
239 }
240
241 #[inline]
242 pub(crate) const fn to_quit(&self) -> LazyStateID {
243 LazyStateID::new_unchecked(
244 self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
245 )
246 }
247
248 /// Return this lazy state ID as a state ID that is tagged as a start
249 /// state.
250 #[inline]
251 pub(crate) const fn to_start(&self) -> LazyStateID {
252 LazyStateID::new_unchecked(
253 self.as_usize_unchecked() | LazyStateID::MASK_START,
254 )
255 }
256
257 /// Return this lazy state ID as a lazy state ID that is tagged as a match
258 /// state.
259 #[inline]
260 pub(crate) const fn to_match(&self) -> LazyStateID {
261 LazyStateID::new_unchecked(
262 self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
263 )
264 }
265
266 /// Return true if and only if this lazy state ID is tagged.
267 ///
268 /// When a lazy state ID is tagged, then one can conclude that it is one
269 /// of a match, start, dead, quit or unknown state.
270 #[inline]
271 pub const fn is_tagged(&self) -> bool {
272 self.as_usize_unchecked() > LazyStateID::MAX
273 }
274
275 /// Return true if and only if this represents a lazy state ID that is
276 /// "unknown." That is, the state has not yet been created. When a caller
277 /// sees this state ID, it generally means that a state has to be computed
278 /// in order to proceed.
279 #[inline]
280 pub const fn is_unknown(&self) -> bool {
281 self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
282 }
283
284 /// Return true if and only if this represents a dead state. A dead state
285 /// is a state that can never transition to any other state except the
286 /// dead state. When a dead state is seen, it generally indicates that a
287 /// search should stop.
288 #[inline]
289 pub const fn is_dead(&self) -> bool {
290 self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
291 }
292
293 /// Return true if and only if this represents a quit state. A quit state
294 /// is a state that is representationally equivalent to a dead state,
295 /// except it indicates the automaton has reached a point at which it can
296 /// no longer determine whether a match exists or not. In general, this
297 /// indicates an error during search and the caller must either pass this
298 /// error up or use a different search technique.
299 #[inline]
300 pub const fn is_quit(&self) -> bool {
301 self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
302 }
303
304 /// Return true if and only if this lazy state ID has been tagged as a
305 /// start state.
306 ///
307 /// Note that if
308 /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config) is
309 /// disabled (which is the default), then this will always return false
310 /// since start states won't be tagged.
311 #[inline]
312 pub const fn is_start(&self) -> bool {
313 self.as_usize_unchecked() & LazyStateID::MASK_START > 0
314 }
315
316 /// Return true if and only if this lazy state ID has been tagged as a
317 /// match state.
318 #[inline]
319 pub const fn is_match(&self) -> bool {
320 self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
321 }
322}
323
324/// This error occurs when a lazy state ID could not be constructed.
325///
326/// This occurs when given an integer exceeding the maximum lazy state ID
327/// value.
328///
329/// When the `std` feature is enabled, this implements the `Error` trait.
330#[derive(Clone, Debug, Eq, PartialEq)]
331pub(crate) struct LazyStateIDError {
332 attempted: u64,
333}
334
335impl LazyStateIDError {
336 /// Returns the value that failed to constructed a lazy state ID.
337 pub(crate) fn attempted(&self) -> u64 {
338 self.attempted
339 }
340}
341
342#[cfg(feature = "std")]
343impl std::error::Error for LazyStateIDError {}
344
345impl core::fmt::Display for LazyStateIDError {
346 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
347 write!(
348 f,
349 "failed to create LazyStateID from {:?}, which exceeds {:?}",
350 self.attempted(),
351 LazyStateID::MAX,
352 )
353 }
354}
355