1 | use std::collections::HashMap; |
2 | use std::mem; |
3 | use std::rc::Rc; |
4 | |
5 | use dense; |
6 | use error::Result; |
7 | use nfa::{self, NFA}; |
8 | use sparse_set::SparseSet; |
9 | use state_id::{dead_id, StateID}; |
10 | |
11 | type DFARepr<S> = dense::Repr<Vec<S>, S>; |
12 | |
13 | /// A determinizer converts an NFA to a DFA. |
14 | /// |
15 | /// This determinizer follows the typical powerset construction, where each |
16 | /// DFA state is comprised of one or more NFA states. In the worst case, there |
17 | /// is one DFA state for every possible combination of NFA states. In practice, |
18 | /// this only happens in certain conditions, typically when there are bounded |
19 | /// repetitions. |
20 | /// |
21 | /// The type variable `S` refers to the chosen state identifier representation |
22 | /// used for the DFA. |
23 | /// |
24 | /// The lifetime variable `'a` refers to the lifetime of the NFA being |
25 | /// converted to a DFA. |
26 | #[derive(Debug)] |
27 | pub(crate) struct Determinizer<'a, S: StateID> { |
28 | /// The NFA we're converting into a DFA. |
29 | nfa: &'a NFA, |
30 | /// The DFA we're building. |
31 | dfa: DFARepr<S>, |
32 | /// Each DFA state being built is defined as an *ordered* set of NFA |
33 | /// states, along with a flag indicating whether the state is a match |
34 | /// state or not. |
35 | /// |
36 | /// This is never empty. The first state is always a dummy state such that |
37 | /// a state id == 0 corresponds to a dead state. |
38 | builder_states: Vec<Rc<State>>, |
39 | /// A cache of DFA states that already exist and can be easily looked up |
40 | /// via ordered sets of NFA states. |
41 | cache: HashMap<Rc<State>, S>, |
42 | /// Scratch space for a stack of NFA states to visit, for depth first |
43 | /// visiting without recursion. |
44 | stack: Vec<nfa::StateID>, |
45 | /// Scratch space for storing an ordered sequence of NFA states, for |
46 | /// amortizing allocation. |
47 | scratch_nfa_states: Vec<nfa::StateID>, |
48 | /// Whether to build a DFA that finds the longest possible match. |
49 | longest_match: bool, |
50 | } |
51 | |
52 | /// An intermediate representation for a DFA state during determinization. |
53 | #[derive(Debug, Eq, Hash, PartialEq)] |
54 | struct State { |
55 | /// Whether this state is a match state or not. |
56 | is_match: bool, |
57 | /// An ordered sequence of NFA states that make up this DFA state. |
58 | nfa_states: Vec<nfa::StateID>, |
59 | } |
60 | |
61 | impl<'a, S: StateID> Determinizer<'a, S> { |
62 | /// Create a new determinizer for converting the given NFA to a DFA. |
63 | pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { |
64 | let dead = Rc::new(State::dead()); |
65 | let mut cache = HashMap::default(); |
66 | cache.insert(dead.clone(), dead_id()); |
67 | |
68 | Determinizer { |
69 | nfa, |
70 | dfa: DFARepr::empty().anchored(nfa.is_anchored()), |
71 | builder_states: vec![dead], |
72 | cache, |
73 | stack: vec![], |
74 | scratch_nfa_states: vec![], |
75 | longest_match: false, |
76 | } |
77 | } |
78 | |
79 | /// Instruct the determinizer to use equivalence classes as the transition |
80 | /// alphabet instead of all possible byte values. |
81 | pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { |
82 | let byte_classes = self.nfa.byte_classes().clone(); |
83 | self.dfa = DFARepr::empty_with_byte_classes(byte_classes) |
84 | .anchored(self.nfa.is_anchored()); |
85 | self |
86 | } |
87 | |
88 | /// Instruct the determinizer to build a DFA that recognizes the longest |
89 | /// possible match instead of the leftmost first match. This is useful when |
90 | /// constructing reverse DFAs for finding the start of a match. |
91 | pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { |
92 | self.longest_match = yes; |
93 | self |
94 | } |
95 | |
96 | /// Build the DFA. If there was a problem constructing the DFA (e.g., if |
97 | /// the chosen state identifier representation is too small), then an error |
98 | /// is returned. |
99 | pub fn build(mut self) -> Result<DFARepr<S>> { |
100 | let representative_bytes: Vec<u8> = |
101 | self.dfa.byte_classes().representatives().collect(); |
102 | let mut sparse = self.new_sparse_set(); |
103 | let mut uncompiled = vec![self.add_start(&mut sparse)?]; |
104 | while let Some(dfa_id) = uncompiled.pop() { |
105 | for &b in &representative_bytes { |
106 | let (next_dfa_id, is_new) = |
107 | self.cached_state(dfa_id, b, &mut sparse)?; |
108 | self.dfa.add_transition(dfa_id, b, next_dfa_id); |
109 | if is_new { |
110 | uncompiled.push(next_dfa_id); |
111 | } |
112 | } |
113 | } |
114 | |
115 | // At this point, we shuffle the matching states in the final DFA to |
116 | // the beginning. This permits a DFA's match loop to detect a match |
117 | // condition by merely inspecting the current state's identifier, and |
118 | // avoids the need for any additional auxiliary storage. |
119 | let is_match: Vec<bool> = |
120 | self.builder_states.iter().map(|s| s.is_match).collect(); |
121 | self.dfa.shuffle_match_states(&is_match); |
122 | Ok(self.dfa) |
123 | } |
124 | |
125 | /// Return the identifier for the next DFA state given an existing DFA |
126 | /// state and an input byte. If the next DFA state already exists, then |
127 | /// return its identifier from the cache. Otherwise, build the state, cache |
128 | /// it and return its identifier. |
129 | /// |
130 | /// The given sparse set is used for scratch space. It must have a capacity |
131 | /// equivalent to the total number of NFA states, but its contents are |
132 | /// otherwise unspecified. |
133 | /// |
134 | /// This routine returns a boolean indicating whether a new state was |
135 | /// built. If a new state is built, then the caller needs to add it to its |
136 | /// frontier of uncompiled DFA states to compute transitions for. |
137 | fn cached_state( |
138 | &mut self, |
139 | dfa_id: S, |
140 | b: u8, |
141 | sparse: &mut SparseSet, |
142 | ) -> Result<(S, bool)> { |
143 | sparse.clear(); |
144 | // Compute the set of all reachable NFA states, including epsilons. |
145 | self.next(dfa_id, b, sparse); |
146 | // Build a candidate state and check if it has already been built. |
147 | let state = self.new_state(sparse); |
148 | if let Some(&cached_id) = self.cache.get(&state) { |
149 | // Since we have a cached state, put the constructed state's |
150 | // memory back into our scratch space, so that it can be reused. |
151 | let _ = |
152 | mem::replace(&mut self.scratch_nfa_states, state.nfa_states); |
153 | return Ok((cached_id, false)); |
154 | } |
155 | // Nothing was in the cache, so add this state to the cache. |
156 | self.add_state(state).map(|s| (s, true)) |
157 | } |
158 | |
159 | /// Compute the set of all eachable NFA states, including the full epsilon |
160 | /// closure, from a DFA state for a single byte of input. |
161 | fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) { |
162 | next_nfa_states.clear(); |
163 | for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { |
164 | let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; |
165 | match *self.nfa.state(nfa_id) { |
166 | nfa::State::Union { .. } |
167 | | nfa::State::Fail |
168 | | nfa::State::Match => {} |
169 | nfa::State::Range { range: ref r } => { |
170 | if r.start <= b && b <= r.end { |
171 | self.epsilon_closure(r.next, next_nfa_states); |
172 | } |
173 | } |
174 | nfa::State::Sparse { ref ranges } => { |
175 | for r in ranges.iter() { |
176 | if r.start > b { |
177 | break; |
178 | } else if r.start <= b && b <= r.end { |
179 | self.epsilon_closure(r.next, next_nfa_states); |
180 | break; |
181 | } |
182 | } |
183 | } |
184 | } |
185 | } |
186 | } |
187 | |
188 | /// Compute the epsilon closure for the given NFA state. |
189 | fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { |
190 | if !self.nfa.state(start).is_epsilon() { |
191 | set.insert(start); |
192 | return; |
193 | } |
194 | |
195 | self.stack.push(start); |
196 | while let Some(mut id) = self.stack.pop() { |
197 | loop { |
198 | if set.contains(id) { |
199 | break; |
200 | } |
201 | set.insert(id); |
202 | match *self.nfa.state(id) { |
203 | nfa::State::Range { .. } |
204 | | nfa::State::Sparse { .. } |
205 | | nfa::State::Fail |
206 | | nfa::State::Match => break, |
207 | nfa::State::Union { ref alternates } => { |
208 | id = match alternates.get(0) { |
209 | None => break, |
210 | Some(&id) => id, |
211 | }; |
212 | self.stack.extend(alternates[1..].iter().rev()); |
213 | } |
214 | } |
215 | } |
216 | } |
217 | } |
218 | |
219 | /// Compute the initial DFA state and return its identifier. |
220 | /// |
221 | /// The sparse set given is used for scratch space, and must have capacity |
222 | /// equal to the total number of NFA states. Its contents are unspecified. |
223 | fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> { |
224 | sparse.clear(); |
225 | self.epsilon_closure(self.nfa.start(), sparse); |
226 | let state = self.new_state(&sparse); |
227 | let id = self.add_state(state)?; |
228 | self.dfa.set_start_state(id); |
229 | Ok(id) |
230 | } |
231 | |
232 | /// Add the given state to the DFA and make it available in the cache. |
233 | /// |
234 | /// The state initially has no transitions. That is, it transitions to the |
235 | /// dead state for all possible inputs. |
236 | fn add_state(&mut self, state: State) -> Result<S> { |
237 | let id = self.dfa.add_empty_state()?; |
238 | let rstate = Rc::new(state); |
239 | self.builder_states.push(rstate.clone()); |
240 | self.cache.insert(rstate, id); |
241 | Ok(id) |
242 | } |
243 | |
244 | /// Convert the given set of ordered NFA states to a DFA state. |
245 | fn new_state(&mut self, set: &SparseSet) -> State { |
246 | let mut state = State { |
247 | is_match: false, |
248 | nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), |
249 | }; |
250 | state.nfa_states.clear(); |
251 | |
252 | for &id in set { |
253 | match *self.nfa.state(id) { |
254 | nfa::State::Range { .. } => { |
255 | state.nfa_states.push(id); |
256 | } |
257 | nfa::State::Sparse { .. } => { |
258 | state.nfa_states.push(id); |
259 | } |
260 | nfa::State::Fail => { |
261 | break; |
262 | } |
263 | nfa::State::Match => { |
264 | state.is_match = true; |
265 | if !self.longest_match { |
266 | break; |
267 | } |
268 | } |
269 | nfa::State::Union { .. } => {} |
270 | } |
271 | } |
272 | state |
273 | } |
274 | |
275 | /// Create a new sparse set with enough capacity to hold all NFA states. |
276 | fn new_sparse_set(&self) -> SparseSet { |
277 | SparseSet::new(self.nfa.len()) |
278 | } |
279 | } |
280 | |
281 | impl State { |
282 | /// Create a new empty dead state. |
283 | fn dead() -> State { |
284 | State { nfa_states: vec![], is_match: false } |
285 | } |
286 | } |
287 | |