1use std::fmt;
2
3use classes::ByteClasses;
4pub use nfa::compiler::Builder;
5
6mod compiler;
7mod map;
8mod range_trie;
9
10/// The representation for an NFA state identifier.
11pub type StateID = usize;
12
13/// A final compiled NFA.
14///
15/// The states of the NFA are indexed by state IDs, which are how transitions
16/// are expressed.
17#[derive(Clone)]
18pub struct NFA {
19 /// Whether this NFA can only match at the beginning of input or not.
20 ///
21 /// When true, a match should only be reported if it begins at the 0th
22 /// index of the haystack.
23 anchored: bool,
24 /// The starting state of this NFA.
25 start: StateID,
26 /// The state list. This list is guaranteed to be indexable by the starting
27 /// state ID, and it is also guaranteed to contain exactly one `Match`
28 /// state.
29 states: Vec<State>,
30 /// A mapping from any byte value to its corresponding equivalence class
31 /// identifier. Two bytes in the same equivalence class cannot discriminate
32 /// between a match or a non-match. This map can be used to shrink the
33 /// total size of a DFA's transition table with a small match-time cost.
34 ///
35 /// Note that the NFA's transitions are *not* defined in terms of these
36 /// equivalence classes. The NFA's transitions are defined on the original
37 /// byte values. For the most part, this is because they wouldn't really
38 /// help the NFA much since the NFA already uses a sparse representation
39 /// to represent transitions. Byte classes are most effective in a dense
40 /// representation.
41 byte_classes: ByteClasses,
42}
43
44impl NFA {
45 /// Returns an NFA that always matches at every position.
46 pub fn always_match() -> NFA {
47 NFA {
48 anchored: false,
49 start: 0,
50 states: vec![State::Match],
51 byte_classes: ByteClasses::empty(),
52 }
53 }
54
55 /// Returns an NFA that never matches at any position.
56 pub fn never_match() -> NFA {
57 NFA {
58 anchored: false,
59 start: 0,
60 states: vec![State::Fail],
61 byte_classes: ByteClasses::empty(),
62 }
63 }
64
65 /// Returns true if and only if this NFA is anchored.
66 pub fn is_anchored(&self) -> bool {
67 self.anchored
68 }
69
70 /// Return the number of states in this NFA.
71 pub fn len(&self) -> usize {
72 self.states.len()
73 }
74
75 /// Return the ID of the initial state of this NFA.
76 pub fn start(&self) -> StateID {
77 self.start
78 }
79
80 /// Return the NFA state corresponding to the given ID.
81 pub fn state(&self, id: StateID) -> &State {
82 &self.states[id]
83 }
84
85 /// Return the set of equivalence classes for this NFA. The slice returned
86 /// always has length 256 and maps each possible byte value to its
87 /// corresponding equivalence class ID (which is never more than 255).
88 pub fn byte_classes(&self) -> &ByteClasses {
89 &self.byte_classes
90 }
91}
92
93impl fmt::Debug for NFA {
94 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
95 for (i: usize, state: &State) in self.states.iter().enumerate() {
96 let status: char = if i == self.start { '>' } else { ' ' };
97 writeln!(f, "{}{:06}: {:?}", status, i, state)?;
98 }
99 Ok(())
100 }
101}
102
103/// A state in a final compiled NFA.
104#[derive(Clone, Eq, PartialEq)]
105pub enum State {
106 /// A state that transitions to `next` if and only if the current input
107 /// byte is in the range `[start, end]` (inclusive).
108 ///
109 /// This is a special case of Sparse in that it encodes only one transition
110 /// (and therefore avoids the allocation).
111 Range { range: Transition },
112 /// A state with possibly many transitions, represented in a sparse
113 /// fashion. Transitions are ordered lexicographically by input range.
114 /// As such, this may only be used when every transition has equal
115 /// priority. (In practice, this is only used for encoding large UTF-8
116 /// automata.)
117 Sparse { ranges: Box<[Transition]> },
118 /// An alternation such that there exists an epsilon transition to all
119 /// states in `alternates`, where matches found via earlier transitions
120 /// are preferred over later transitions.
121 Union { alternates: Box<[StateID]> },
122 /// A fail state. When encountered, the automaton is guaranteed to never
123 /// reach a match state.
124 Fail,
125 /// A match state. There is exactly one such occurrence of this state in
126 /// an NFA.
127 Match,
128}
129
130/// A transition to another state, only if the given byte falls in the
131/// inclusive range specified.
132#[derive(Clone, Copy, Eq, Hash, PartialEq)]
133pub struct Transition {
134 pub start: u8,
135 pub end: u8,
136 pub next: StateID,
137}
138
139impl State {
140 /// Returns true if and only if this state contains one or more epsilon
141 /// transitions.
142 pub fn is_epsilon(&self) -> bool {
143 match *self {
144 State::Range { .. }
145 | State::Sparse { .. }
146 | State::Fail
147 | State::Match => false,
148 State::Union { .. } => true,
149 }
150 }
151
152 /// Remap the transitions in this state using the given map. Namely, the
153 /// given map should be indexed according to the transitions currently
154 /// in this state.
155 ///
156 /// This is used during the final phase of the NFA compiler, which turns
157 /// its intermediate NFA into the final NFA.
158 fn remap(&mut self, remap: &[StateID]) {
159 match *self {
160 State::Range { ref mut range } => range.next = remap[range.next],
161 State::Sparse { ref mut ranges } => {
162 for r in ranges.iter_mut() {
163 r.next = remap[r.next];
164 }
165 }
166 State::Union { ref mut alternates } => {
167 for alt in alternates.iter_mut() {
168 *alt = remap[*alt];
169 }
170 }
171 State::Fail => {}
172 State::Match => {}
173 }
174 }
175}
176
177impl fmt::Debug for State {
178 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
179 match *self {
180 State::Range { ref range } => range.fmt(f),
181 State::Sparse { ref ranges } => {
182 let rs = ranges
183 .iter()
184 .map(|t| format!("{:?}", t))
185 .collect::<Vec<String>>()
186 .join(", ");
187 write!(f, "sparse({})", rs)
188 }
189 State::Union { ref alternates } => {
190 let alts = alternates
191 .iter()
192 .map(|id| format!("{}", id))
193 .collect::<Vec<String>>()
194 .join(", ");
195 write!(f, "alt({})", alts)
196 }
197 State::Fail => write!(f, "FAIL"),
198 State::Match => write!(f, "MATCH"),
199 }
200 }
201}
202
203impl fmt::Debug for Transition {
204 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
205 let Transition { start: u8, end: u8, next: usize } = *self;
206 if self.start == self.end {
207 write!(f, "{} => {}", escape(start), next)
208 } else {
209 write!(f, "{}-{} => {}", escape(start), escape(end), next)
210 }
211 }
212}
213
214/// Return the given byte as its escaped string form.
215fn escape(b: u8) -> String {
216 use std::ascii;
217
218 String::from_utf8(vec:ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
219}
220
221#[cfg(test)]
222mod tests {
223 use super::*;
224 use dense;
225 use dfa::DFA;
226
227 #[test]
228 fn always_match() {
229 let nfa = NFA::always_match();
230 let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
231
232 assert_eq!(Some(0), dfa.find_at(b"", 0));
233 assert_eq!(Some(0), dfa.find_at(b"a", 0));
234 assert_eq!(Some(1), dfa.find_at(b"a", 1));
235 assert_eq!(Some(0), dfa.find_at(b"ab", 0));
236 assert_eq!(Some(1), dfa.find_at(b"ab", 1));
237 assert_eq!(Some(2), dfa.find_at(b"ab", 2));
238 }
239
240 #[test]
241 fn never_match() {
242 let nfa = NFA::never_match();
243 let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
244
245 assert_eq!(None, dfa.find_at(b"", 0));
246 assert_eq!(None, dfa.find_at(b"a", 0));
247 assert_eq!(None, dfa.find_at(b"a", 1));
248 assert_eq!(None, dfa.find_at(b"ab", 0));
249 assert_eq!(None, dfa.find_at(b"ab", 1));
250 assert_eq!(None, dfa.find_at(b"ab", 2));
251 }
252}
253