mod.rs source code [crates/regex-automata-0.1.10/src/nfa/mod.rs]

1	use std::fmt;
2
3	use classes::ByteClasses;
4	pub use nfa::compiler::Builder;
5
6	mod compiler;
7	mod map;
8	mod range_trie;
9
10	/// The representation for an NFA state identifier.
11	pub type StateID = usize;
12
13	/// A final compiled NFA.
14	///
15	/// The states of the NFA are indexed by state IDs, which are how transitions
16	/// are expressed.
17	#[derive(Clone)]
18	pub struct NFA {
19	/// Whether this NFA can only match at the beginning of input or not.
20	///
21	/// When true, a match should only be reported if it begins at the 0th
22	/// index of the haystack.
23	anchored: bool,
24	/// The starting state of this NFA.
25	start: StateID,
26	/// The state list. This list is guaranteed to be indexable by the starting
27	/// state ID, and it is also guaranteed to contain exactly one `Match`
28	/// state.
29	states: Vec<State>,
30	/// A mapping from any byte value to its corresponding equivalence class
31	/// identifier. Two bytes in the same equivalence class cannot discriminate
32	/// between a match or a non-match. This map can be used to shrink the
33	/// total size of a DFA's transition table with a small match-time cost.
34	///
35	/// Note that the NFA's transitions are not* defined in terms of these*
36	/// equivalence classes. The NFA's transitions are defined on the original
37	/// byte values. For the most part, this is because they wouldn't really
38	/// help the NFA much since the NFA already uses a sparse representation
39	/// to represent transitions. Byte classes are most effective in a dense
40	/// representation.
41	byte_classes: ByteClasses,
42	}
43
44	impl NFA {
45	/// Returns an NFA that always matches at every position.
46	pub fn always_match() -> NFA {
47	NFA {
48	anchored: `false`,
49	start: `0`,
50	states: vec![State::Match],
51	byte_classes: ByteClasses::empty(),
52	}
53	}
54
55	/// Returns an NFA that never matches at any position.
56	pub fn never_match() -> NFA {
57	NFA {
58	anchored: `false`,
59	start: `0`,
60	states: vec![State::Fail],
61	byte_classes: ByteClasses::empty(),
62	}
63	}
64
65	/// Returns true if and only if this NFA is anchored.
66	pub fn is_anchored(&self) -> bool {
67	self.anchored
68	}
69
70	/// Return the number of states in this NFA.
71	pub fn len(&self) -> usize {
72	self.states.len()
73	}
74
75	/// Return the ID of the initial state of this NFA.
76	pub fn start(&self) -> StateID {
77	self.start
78	}
79
80	/// Return the NFA state corresponding to the given ID.
81	pub fn state(&self, id: StateID) -> &State {
82	&self.states[id]
83	}
84
85	/// Return the set of equivalence classes for this NFA. The slice returned
86	/// always has length 256 and maps each possible byte value to its
87	/// corresponding equivalence class ID (which is never more than 255).
88	pub fn byte_classes(&self) -> &ByteClasses {
89	&self.byte_classes
90	}
91	}
92
93	impl fmt::Debug for NFA {
94	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
95	for (i: usize, state: &State) in self.states.iter().enumerate() {
96	let status: char = if i == self.start { '>' } else { ' ' };
97	writeln!(f, "{}{:`06`}: {:?}", status, i, state)?;
98	}
99	Ok(())
100	}
101	}
102
103	/// A state in a final compiled NFA.
104	#[derive(Clone, Eq, PartialEq)]
105	pub enum State {
106	/// A state that transitions to `next` if and only if the current input
107	/// byte is in the range `[start, end]` (inclusive).
108	///
109	/// This is a special case of Sparse in that it encodes only one transition
110	/// (and therefore avoids the allocation).
111	Range { range: Transition },
112	/// A state with possibly many transitions, represented in a sparse
113	/// fashion. Transitions are ordered lexicographically by input range.
114	/// As such, this may only be used when every transition has equal
115	/// priority. (In practice, this is only used for encoding large UTF-8
116	/// automata.)
117	Sparse { ranges: Box<[Transition]> },
118	/// An alternation such that there exists an epsilon transition to all
119	/// states in `alternates`, where matches found via earlier transitions
120	/// are preferred over later transitions.
121	Union { alternates: Box<[StateID]> },
122	/// A fail state. When encountered, the automaton is guaranteed to never
123	/// reach a match state.
124	Fail,
125	/// A match state. There is exactly one such occurrence of this state in
126	/// an NFA.
127	Match,
128	}
129
130	/// A transition to another state, only if the given byte falls in the
131	/// inclusive range specified.
132	#[derive(Clone, Copy, Eq, Hash, PartialEq)]
133	pub struct Transition {
134	pub start: u8,
135	pub end: u8,
136	pub next: StateID,
137	}
138
139	impl State {
140	/// Returns true if and only if this state contains one or more epsilon
141	/// transitions.
142	pub fn is_epsilon(&self) -> bool {
143	match *self {
144	State::Range { .. }
145	\| State::Sparse { .. }
146	\| State::Fail
147	\| State::Match => `false`,
148	State::Union { .. } => `true`,
149	}
150	}
151
152	/// Remap the transitions in this state using the given map. Namely, the
153	/// given map should be indexed according to the transitions currently
154	/// in this state.
155	///
156	/// This is used during the final phase of the NFA compiler, which turns
157	/// its intermediate NFA into the final NFA.
158	fn remap(&mut self, remap: &[StateID]) {
159	match *self {
160	State::Range { ref mut range } => range.next = remap[range.next],
161	State::Sparse { ref mut ranges } => {
162	for r in ranges.iter_mut() {
163	r.next = remap[r.next];
164	}
165	}
166	State::Union { ref mut alternates } => {
167	for alt in alternates.iter_mut() {
168	alt = remap[alt];
169	}
170	}
171	State::Fail => {}
172	State::Match => {}
173	}
174	}
175	}
176
177	impl fmt::Debug for State {
178	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
179	match *self {
180	State::Range { ref range } => range.fmt(f),
181	State::Sparse { ref ranges } => {
182	let rs = ranges
183	.iter()
184	.map(\|t\| format!("{:?}", t))
185	.collect::<Vec<String>>()
186	.join(", ");
187	write!(f, "sparse({})", rs)
188	}
189	State::Union { ref alternates } => {
190	let alts = alternates
191	.iter()
192	.map(\|id\| format!("{}", id))
193	.collect::<Vec<String>>()
194	.join(", ");
195	write!(f, "alt({})", alts)
196	}
197	State::Fail => write!(f, "FAIL"),
198	State::Match => write!(f, "MATCH"),
199	}
200	}
201	}
202
203	impl fmt::Debug for Transition {
204	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
205	let Transition { start: u8, end: u8, next: usize } = *self;
206	if self.start == self.end {
207	write!(f, "{} => {}", escape(start), next)
208	} else {
209	write!(f, "{}-{} => {}", escape(start), escape(end), next)
210	}
211	}
212	}
213
214	/// Return the given byte as its escaped string form.
215	fn escape(b: u8) -> String {
216	use std::ascii;
217
218	String::from_utf8(vec:ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
219	}
220
221	#[cfg(test)]
222	mod tests {
223	use super::*;
224	use dense;
225	use dfa::DFA;
226
227	#[test]
228	fn always_match() {
229	let nfa = NFA::always_match();
230	let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
231
232	assert_eq!(Some(`0`), dfa.find_at(b"", `0`));
233	assert_eq!(Some(`0`), dfa.find_at(b"a", `0`));
234	assert_eq!(Some(`1`), dfa.find_at(b"a", `1`));
235	assert_eq!(Some(`0`), dfa.find_at(b"ab", `0`));
236	assert_eq!(Some(`1`), dfa.find_at(b"ab", `1`));
237	assert_eq!(Some(`2`), dfa.find_at(b"ab", `2`));
238	}
239
240	#[test]
241	fn never_match() {
242	let nfa = NFA::never_match();
243	let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
244
245	assert_eq!(None, dfa.find_at(b"", `0`));
246	assert_eq!(None, dfa.find_at(b"a", `0`));
247	assert_eq!(None, dfa.find_at(b"a", `1`));
248	assert_eq!(None, dfa.find_at(b"ab", `0`));
249	assert_eq!(None, dfa.find_at(b"ab", `1`));
250	assert_eq!(None, dfa.find_at(b"ab", `2`));
251	}
252	}
253