1use std::collections::HashMap;
2use std::mem;
3use std::rc::Rc;
4
5use dense;
6use error::Result;
7use nfa::{self, NFA};
8use sparse_set::SparseSet;
9use state_id::{dead_id, StateID};
10
11type DFARepr<S> = dense::Repr<Vec<S>, S>;
12
13/// A determinizer converts an NFA to a DFA.
14///
15/// This determinizer follows the typical powerset construction, where each
16/// DFA state is comprised of one or more NFA states. In the worst case, there
17/// is one DFA state for every possible combination of NFA states. In practice,
18/// this only happens in certain conditions, typically when there are bounded
19/// repetitions.
20///
21/// The type variable `S` refers to the chosen state identifier representation
22/// used for the DFA.
23///
24/// The lifetime variable `'a` refers to the lifetime of the NFA being
25/// converted to a DFA.
26#[derive(Debug)]
27pub(crate) struct Determinizer<'a, S: StateID> {
28 /// The NFA we're converting into a DFA.
29 nfa: &'a NFA,
30 /// The DFA we're building.
31 dfa: DFARepr<S>,
32 /// Each DFA state being built is defined as an *ordered* set of NFA
33 /// states, along with a flag indicating whether the state is a match
34 /// state or not.
35 ///
36 /// This is never empty. The first state is always a dummy state such that
37 /// a state id == 0 corresponds to a dead state.
38 builder_states: Vec<Rc<State>>,
39 /// A cache of DFA states that already exist and can be easily looked up
40 /// via ordered sets of NFA states.
41 cache: HashMap<Rc<State>, S>,
42 /// Scratch space for a stack of NFA states to visit, for depth first
43 /// visiting without recursion.
44 stack: Vec<nfa::StateID>,
45 /// Scratch space for storing an ordered sequence of NFA states, for
46 /// amortizing allocation.
47 scratch_nfa_states: Vec<nfa::StateID>,
48 /// Whether to build a DFA that finds the longest possible match.
49 longest_match: bool,
50}
51
52/// An intermediate representation for a DFA state during determinization.
53#[derive(Debug, Eq, Hash, PartialEq)]
54struct State {
55 /// Whether this state is a match state or not.
56 is_match: bool,
57 /// An ordered sequence of NFA states that make up this DFA state.
58 nfa_states: Vec<nfa::StateID>,
59}
60
61impl<'a, S: StateID> Determinizer<'a, S> {
62 /// Create a new determinizer for converting the given NFA to a DFA.
63 pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
64 let dead = Rc::new(State::dead());
65 let mut cache = HashMap::default();
66 cache.insert(dead.clone(), dead_id());
67
68 Determinizer {
69 nfa,
70 dfa: DFARepr::empty().anchored(nfa.is_anchored()),
71 builder_states: vec![dead],
72 cache,
73 stack: vec![],
74 scratch_nfa_states: vec![],
75 longest_match: false,
76 }
77 }
78
79 /// Instruct the determinizer to use equivalence classes as the transition
80 /// alphabet instead of all possible byte values.
81 pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
82 let byte_classes = self.nfa.byte_classes().clone();
83 self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
84 .anchored(self.nfa.is_anchored());
85 self
86 }
87
88 /// Instruct the determinizer to build a DFA that recognizes the longest
89 /// possible match instead of the leftmost first match. This is useful when
90 /// constructing reverse DFAs for finding the start of a match.
91 pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
92 self.longest_match = yes;
93 self
94 }
95
96 /// Build the DFA. If there was a problem constructing the DFA (e.g., if
97 /// the chosen state identifier representation is too small), then an error
98 /// is returned.
99 pub fn build(mut self) -> Result<DFARepr<S>> {
100 let representative_bytes: Vec<u8> =
101 self.dfa.byte_classes().representatives().collect();
102 let mut sparse = self.new_sparse_set();
103 let mut uncompiled = vec![self.add_start(&mut sparse)?];
104 while let Some(dfa_id) = uncompiled.pop() {
105 for &b in &representative_bytes {
106 let (next_dfa_id, is_new) =
107 self.cached_state(dfa_id, b, &mut sparse)?;
108 self.dfa.add_transition(dfa_id, b, next_dfa_id);
109 if is_new {
110 uncompiled.push(next_dfa_id);
111 }
112 }
113 }
114
115 // At this point, we shuffle the matching states in the final DFA to
116 // the beginning. This permits a DFA's match loop to detect a match
117 // condition by merely inspecting the current state's identifier, and
118 // avoids the need for any additional auxiliary storage.
119 let is_match: Vec<bool> =
120 self.builder_states.iter().map(|s| s.is_match).collect();
121 self.dfa.shuffle_match_states(&is_match);
122 Ok(self.dfa)
123 }
124
125 /// Return the identifier for the next DFA state given an existing DFA
126 /// state and an input byte. If the next DFA state already exists, then
127 /// return its identifier from the cache. Otherwise, build the state, cache
128 /// it and return its identifier.
129 ///
130 /// The given sparse set is used for scratch space. It must have a capacity
131 /// equivalent to the total number of NFA states, but its contents are
132 /// otherwise unspecified.
133 ///
134 /// This routine returns a boolean indicating whether a new state was
135 /// built. If a new state is built, then the caller needs to add it to its
136 /// frontier of uncompiled DFA states to compute transitions for.
137 fn cached_state(
138 &mut self,
139 dfa_id: S,
140 b: u8,
141 sparse: &mut SparseSet,
142 ) -> Result<(S, bool)> {
143 sparse.clear();
144 // Compute the set of all reachable NFA states, including epsilons.
145 self.next(dfa_id, b, sparse);
146 // Build a candidate state and check if it has already been built.
147 let state = self.new_state(sparse);
148 if let Some(&cached_id) = self.cache.get(&state) {
149 // Since we have a cached state, put the constructed state's
150 // memory back into our scratch space, so that it can be reused.
151 let _ =
152 mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
153 return Ok((cached_id, false));
154 }
155 // Nothing was in the cache, so add this state to the cache.
156 self.add_state(state).map(|s| (s, true))
157 }
158
159 /// Compute the set of all eachable NFA states, including the full epsilon
160 /// closure, from a DFA state for a single byte of input.
161 fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
162 next_nfa_states.clear();
163 for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
164 let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
165 match *self.nfa.state(nfa_id) {
166 nfa::State::Union { .. }
167 | nfa::State::Fail
168 | nfa::State::Match => {}
169 nfa::State::Range { range: ref r } => {
170 if r.start <= b && b <= r.end {
171 self.epsilon_closure(r.next, next_nfa_states);
172 }
173 }
174 nfa::State::Sparse { ref ranges } => {
175 for r in ranges.iter() {
176 if r.start > b {
177 break;
178 } else if r.start <= b && b <= r.end {
179 self.epsilon_closure(r.next, next_nfa_states);
180 break;
181 }
182 }
183 }
184 }
185 }
186 }
187
188 /// Compute the epsilon closure for the given NFA state.
189 fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
190 if !self.nfa.state(start).is_epsilon() {
191 set.insert(start);
192 return;
193 }
194
195 self.stack.push(start);
196 while let Some(mut id) = self.stack.pop() {
197 loop {
198 if set.contains(id) {
199 break;
200 }
201 set.insert(id);
202 match *self.nfa.state(id) {
203 nfa::State::Range { .. }
204 | nfa::State::Sparse { .. }
205 | nfa::State::Fail
206 | nfa::State::Match => break,
207 nfa::State::Union { ref alternates } => {
208 id = match alternates.get(0) {
209 None => break,
210 Some(&id) => id,
211 };
212 self.stack.extend(alternates[1..].iter().rev());
213 }
214 }
215 }
216 }
217 }
218
219 /// Compute the initial DFA state and return its identifier.
220 ///
221 /// The sparse set given is used for scratch space, and must have capacity
222 /// equal to the total number of NFA states. Its contents are unspecified.
223 fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
224 sparse.clear();
225 self.epsilon_closure(self.nfa.start(), sparse);
226 let state = self.new_state(&sparse);
227 let id = self.add_state(state)?;
228 self.dfa.set_start_state(id);
229 Ok(id)
230 }
231
232 /// Add the given state to the DFA and make it available in the cache.
233 ///
234 /// The state initially has no transitions. That is, it transitions to the
235 /// dead state for all possible inputs.
236 fn add_state(&mut self, state: State) -> Result<S> {
237 let id = self.dfa.add_empty_state()?;
238 let rstate = Rc::new(state);
239 self.builder_states.push(rstate.clone());
240 self.cache.insert(rstate, id);
241 Ok(id)
242 }
243
244 /// Convert the given set of ordered NFA states to a DFA state.
245 fn new_state(&mut self, set: &SparseSet) -> State {
246 let mut state = State {
247 is_match: false,
248 nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
249 };
250 state.nfa_states.clear();
251
252 for &id in set {
253 match *self.nfa.state(id) {
254 nfa::State::Range { .. } => {
255 state.nfa_states.push(id);
256 }
257 nfa::State::Sparse { .. } => {
258 state.nfa_states.push(id);
259 }
260 nfa::State::Fail => {
261 break;
262 }
263 nfa::State::Match => {
264 state.is_match = true;
265 if !self.longest_match {
266 break;
267 }
268 }
269 nfa::State::Union { .. } => {}
270 }
271 }
272 state
273 }
274
275 /// Create a new sparse set with enough capacity to hold all NFA states.
276 fn new_sparse_set(&self) -> SparseSet {
277 SparseSet::new(self.nfa.len())
278 }
279}
280
281impl State {
282 /// Create a new empty dead state.
283 fn dead() -> State {
284 State { nfa_states: vec![], is_match: false }
285 }
286}
287