1 | use core::{cell::RefCell, mem::size_of}; |
2 | |
3 | use alloc::{string::String, sync::Arc, vec, vec::Vec}; |
4 | |
5 | use crate::{ |
6 | error::Error, |
7 | hir::{self, Hir, HirKind}, |
8 | int::U32, |
9 | }; |
10 | |
11 | pub(crate) type StateID = u32; |
12 | |
13 | #[derive (Clone, Copy, Debug)] |
14 | pub(crate) struct Config { |
15 | pub(crate) size_limit: Option<usize>, |
16 | } |
17 | |
18 | impl Default for Config { |
19 | fn default() -> Config { |
20 | Config { size_limit: Some(10 * (1 << 20)) } |
21 | } |
22 | } |
23 | |
24 | #[derive (Clone)] |
25 | pub(crate) struct NFA { |
26 | /// The pattern string this NFA was generated from. |
27 | /// |
28 | /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯ |
29 | pattern: String, |
30 | /// The states that make up this NFA. |
31 | states: Vec<State>, |
32 | /// The ID of the start state. |
33 | start: StateID, |
34 | /// Whether this NFA can only match at the beginning of a haystack. |
35 | is_start_anchored: bool, |
36 | /// Whether this NFA can match the empty string. |
37 | is_match_empty: bool, |
38 | /// If every match has the same number of matching capture groups, then |
39 | /// this corresponds to the number of groups. |
40 | static_explicit_captures_len: Option<usize>, |
41 | /// A map from capture group name to its corresponding index. |
42 | cap_name_to_index: CaptureNameMap, |
43 | /// A map from capture group index to the corresponding name, if one |
44 | /// exists. |
45 | cap_index_to_name: Vec<Option<Arc<str>>>, |
46 | /// Heap memory used indirectly by NFA states and other things (like the |
47 | /// various capturing group representations above). Since each state |
48 | /// might use a different amount of heap, we need to keep track of this |
49 | /// incrementally. |
50 | memory_extra: usize, |
51 | } |
52 | |
53 | impl NFA { |
54 | /// Creates a new NFA from the given configuration and HIR. |
55 | pub(crate) fn new( |
56 | config: Config, |
57 | pattern: String, |
58 | hir: &Hir, |
59 | ) -> Result<NFA, Error> { |
60 | Compiler::new(config, pattern).compile(hir) |
61 | } |
62 | |
63 | /// Returns the pattern string used to construct this NFA. |
64 | pub(crate) fn pattern(&self) -> &str { |
65 | &self.pattern |
66 | } |
67 | |
68 | /// Returns the state corresponding to the given ID. |
69 | /// |
70 | /// # Panics |
71 | /// |
72 | /// If the ID does not refer to a valid state, then this panics. |
73 | pub(crate) fn state(&self, id: StateID) -> &State { |
74 | &self.states[id.as_usize()] |
75 | } |
76 | |
77 | /// Returns the total number of states in this NFA. |
78 | pub(crate) fn len(&self) -> usize { |
79 | self.states.len() |
80 | } |
81 | |
82 | /// Returns the ID of the starting state for this NFA. |
83 | pub(crate) fn start(&self) -> StateID { |
84 | self.start |
85 | } |
86 | |
87 | /// Returns the capture group index for the corresponding named group. |
88 | /// If no such group with the given name exists, then `None` is returned. |
89 | pub(crate) fn to_index(&self, name: &str) -> Option<usize> { |
90 | self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) |
91 | } |
92 | |
93 | /* |
94 | /// Returns the capture group name for the corresponding index. |
95 | /// If no such group with the given index, then `None` is returned. |
96 | pub(crate) fn to_name(&self, index: usize) -> Option<&str> { |
97 | self.cap_index_to_name.get(index)?.as_deref() |
98 | } |
99 | */ |
100 | |
101 | /// Returns an iterator over all of the capture groups, along with their |
102 | /// names if they exist, in this NFA. |
103 | pub(crate) fn capture_names(&self) -> CaptureNames<'_> { |
104 | CaptureNames { it: self.cap_index_to_name.iter() } |
105 | } |
106 | |
107 | /// Returns the total number of capture groups, including the first and |
108 | /// implicit group, in this NFA. |
109 | pub(crate) fn group_len(&self) -> usize { |
110 | self.cap_index_to_name.len() |
111 | } |
112 | |
113 | /// Returns true if and only if this NFA can only match at the beginning of |
114 | /// a haystack. |
115 | pub(crate) fn is_start_anchored(&self) -> bool { |
116 | self.is_start_anchored |
117 | } |
118 | |
119 | /// If the pattern always reports the same number of matching capture groups |
120 | /// for every match, then this returns the number of those groups. This |
121 | /// doesn't include the implicit group found in every pattern. |
122 | pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> { |
123 | self.static_explicit_captures_len |
124 | } |
125 | |
126 | /// Returns the heap memory usage, in bytes, used by this NFA. |
127 | fn memory_usage(&self) -> usize { |
128 | (self.states.len() * size_of::<State>()) |
129 | + (self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>()) |
130 | + self.memory_extra |
131 | } |
132 | } |
133 | |
134 | impl core::fmt::Debug for NFA { |
135 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
136 | writeln!(f, "NFA(" )?; |
137 | writeln!(f, "pattern: {}" , self.pattern)?; |
138 | for (sid: usize, state: &State) in self.states.iter().enumerate() { |
139 | writeln!(f, " {:07?}: {:?}" , sid, state)?; |
140 | } |
141 | writeln!(f, ")" )?; |
142 | Ok(()) |
143 | } |
144 | } |
145 | |
146 | /// An iterator over all capture groups in an NFA. |
147 | /// |
148 | /// If a particular group has a name, then it is yielded. Otherwise, `None` |
149 | /// is yielded. |
150 | #[derive (Clone, Debug)] |
151 | pub(crate) struct CaptureNames<'a> { |
152 | it: core::slice::Iter<'a, Option<Arc<str>>>, |
153 | } |
154 | |
155 | impl<'a> Iterator for CaptureNames<'a> { |
156 | type Item = Option<&'a str>; |
157 | |
158 | fn next(&mut self) -> Option<Option<&'a str>> { |
159 | self.it.next().map(|n: &'a Option>| n.as_deref()) |
160 | } |
161 | } |
162 | |
163 | #[derive (Clone, Eq, PartialEq)] |
164 | pub(crate) enum State { |
165 | Char { target: StateID, ch: char }, |
166 | Ranges { target: StateID, ranges: Vec<(char, char)> }, |
167 | Splits { targets: Vec<StateID>, reverse: bool }, |
168 | Goto { target: StateID, look: Option<hir::Look> }, |
169 | Capture { target: StateID, slot: u32 }, |
170 | Fail, |
171 | Match, |
172 | } |
173 | |
174 | impl State { |
175 | /// Returns the heap memory usage of this NFA state in bytes. |
176 | fn memory_usage(&self) -> usize { |
177 | match *self { |
178 | State::Char { .. } |
179 | | State::Goto { .. } |
180 | | State::Capture { .. } |
181 | | State::Fail { .. } |
182 | | State::Match => 0, |
183 | State::Splits { ref targets, .. } => { |
184 | targets.len() * size_of::<StateID>() |
185 | } |
186 | State::Ranges { ref ranges, .. } => { |
187 | ranges.len() * size_of::<(char, char)>() |
188 | } |
189 | } |
190 | } |
191 | |
192 | /// Returns an iterator over the given split targets. The order of the |
193 | /// iterator yields elements in reverse when `reverse` is true. |
194 | pub(crate) fn iter_splits<'a>( |
195 | splits: &'a [StateID], |
196 | reverse: bool, |
197 | ) -> impl Iterator<Item = StateID> + 'a { |
198 | let mut it = splits.iter(); |
199 | core::iter::from_fn(move || { |
200 | if reverse { it.next_back() } else { it.next() }.copied() |
201 | }) |
202 | } |
203 | } |
204 | |
205 | impl core::fmt::Debug for State { |
206 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
207 | match *self { |
208 | State::Char { target, ch } => { |
209 | write!(f, " {:?} => {:?}" , ch, target) |
210 | } |
211 | State::Ranges { target, ref ranges } => { |
212 | for (i, &(start, end)) in ranges.iter().enumerate() { |
213 | if i > 0 { |
214 | write!(f, ", " )?; |
215 | } |
216 | write!(f, " {:?}- {:?} => {:?}" , start, end, target)?; |
217 | } |
218 | Ok(()) |
219 | } |
220 | State::Splits { ref targets, reverse } => { |
221 | write!(f, "splits(" )?; |
222 | for (i, sid) in |
223 | State::iter_splits(targets, reverse).enumerate() |
224 | { |
225 | if i > 0 { |
226 | write!(f, ", " )?; |
227 | } |
228 | write!(f, " {:?}" , sid)?; |
229 | } |
230 | write!(f, ")" ) |
231 | } |
232 | State::Goto { target, look: None } => { |
233 | write!(f, "goto( {:?})" , target) |
234 | } |
235 | State::Goto { target, look: Some(look) } => { |
236 | write!(f, " {:?} => {:?}" , look, target) |
237 | } |
238 | State::Capture { target, slot } => { |
239 | write!(f, "capture(slot= {:?}) => {:?}" , slot, target,) |
240 | } |
241 | State::Fail => write!(f, "FAIL" ), |
242 | State::Match => { |
243 | write!(f, "MATCH" ) |
244 | } |
245 | } |
246 | } |
247 | } |
248 | |
249 | /// A map from capture group name to its corresponding capture group index. |
250 | /// |
251 | /// We define a type alias here so that we can transparently use a `HashMap` |
252 | /// whenever it's available. We do so presumably because it's faster, although |
253 | /// there are no benchmarks verifying this. |
254 | #[cfg (feature = "std" )] |
255 | type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>; |
256 | #[cfg (not(feature = "std" ))] |
257 | type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>; |
258 | |
259 | #[derive (Debug)] |
260 | struct Compiler { |
261 | config: Config, |
262 | nfa: RefCell<NFA>, |
263 | } |
264 | |
265 | impl Compiler { |
266 | fn new(config: Config, pattern: String) -> Compiler { |
267 | let nfa = RefCell::new(NFA { |
268 | pattern, |
269 | states: vec![], |
270 | start: 0, |
271 | is_start_anchored: false, |
272 | is_match_empty: false, |
273 | static_explicit_captures_len: None, |
274 | cap_name_to_index: CaptureNameMap::default(), |
275 | cap_index_to_name: vec![], |
276 | memory_extra: 0, |
277 | }); |
278 | Compiler { config, nfa } |
279 | } |
280 | |
281 | fn compile(self, hir: &Hir) -> Result<NFA, Error> { |
282 | self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); |
283 | self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); |
284 | self.nfa.borrow_mut().static_explicit_captures_len = |
285 | hir.static_explicit_captures_len(); |
286 | let compiled = self.c_capture(0, None, hir)?; |
287 | let mat = self.add(State::Match)?; |
288 | self.patch(compiled.end, mat)?; |
289 | self.nfa.borrow_mut().start = compiled.start; |
290 | Ok(self.nfa.into_inner()) |
291 | } |
292 | |
293 | fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> { |
294 | match *hir.kind() { |
295 | HirKind::Empty => self.c_empty(), |
296 | HirKind::Char(ch) => self.c_char(ch), |
297 | HirKind::Class(ref class) => self.c_class(class), |
298 | HirKind::Look(ref look) => self.c_look(look), |
299 | HirKind::Repetition(ref rep) => self.c_repetition(rep), |
300 | HirKind::Capture(ref cap) => { |
301 | self.c_capture(cap.index, cap.name.as_deref(), &cap.sub) |
302 | } |
303 | HirKind::Concat(ref subs) => { |
304 | self.c_concat(subs.iter().map(|s| self.c(s))) |
305 | } |
306 | HirKind::Alternation(ref subs) => { |
307 | self.c_alternation(subs.iter().map(|s| self.c(s))) |
308 | } |
309 | } |
310 | } |
311 | |
312 | /// Compile a "fail" state that can never be transitioned out of. |
313 | fn c_fail(&self) -> Result<ThompsonRef, Error> { |
314 | let id = self.add(State::Fail)?; |
315 | Ok(ThompsonRef { start: id, end: id }) |
316 | } |
317 | |
318 | /// Compile an "empty" state with one unconditional epsilon transition. |
319 | /// |
320 | /// Both the `start` and `end` locations point to the state created. |
321 | /// Callers will likely want to keep the `start`, but patch the `end` to |
322 | /// point to some other state. |
323 | fn c_empty(&self) -> Result<ThompsonRef, Error> { |
324 | let id = self.add_empty()?; |
325 | Ok(ThompsonRef { start: id, end: id }) |
326 | } |
327 | |
328 | /// Compile the given literal char to an NFA. |
329 | fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> { |
330 | let id = self.add(State::Char { target: 0, ch })?; |
331 | Ok(ThompsonRef { start: id, end: id }) |
332 | } |
333 | |
334 | /// Compile the given character class into an NFA. |
335 | /// |
336 | /// If the class is empty, then this compiles to a `Fail` state. |
337 | fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> { |
338 | let id = if class.ranges.is_empty() { |
339 | // Technically using an explicit fail state probably isn't |
340 | // necessary. Because if you try to match against an empty Ranges, |
341 | // then it should turn up with nothing regardless of input, and |
342 | // thus "acts" like a Fail state. But it's better to be more |
343 | // explicit, and there's no real cost to doing so. |
344 | self.add(State::Fail) |
345 | } else { |
346 | let ranges = |
347 | class.ranges.iter().map(|r| (r.start, r.end)).collect(); |
348 | self.add(State::Ranges { target: 0, ranges }) |
349 | }?; |
350 | Ok(ThompsonRef { start: id, end: id }) |
351 | } |
352 | |
353 | /// Compile the given HIR look-around assertion to an NFA look-around |
354 | /// assertion. |
355 | fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> { |
356 | let id = self.add(State::Goto { target: 0, look: Some(*look) })?; |
357 | Ok(ThompsonRef { start: id, end: id }) |
358 | } |
359 | |
360 | /// Compile the given repetition expression. This handles all types of |
361 | /// repetitions and greediness. |
362 | fn c_repetition( |
363 | &self, |
364 | rep: &hir::Repetition, |
365 | ) -> Result<ThompsonRef, Error> { |
366 | match (rep.min, rep.max) { |
367 | (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), |
368 | (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), |
369 | (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), |
370 | (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), |
371 | } |
372 | } |
373 | |
374 | /// Compile the given expression such that it matches at least `min` times, |
375 | /// but no more than `max` times. |
376 | /// |
377 | /// When `greedy` is true, then the preference is for the expression to |
378 | /// match as much as possible. Otherwise, it will match as little as |
379 | /// possible. |
380 | fn c_bounded( |
381 | &self, |
382 | hir: &Hir, |
383 | greedy: bool, |
384 | min: u32, |
385 | max: u32, |
386 | ) -> Result<ThompsonRef, Error> { |
387 | let prefix = self.c_exactly(hir, min)?; |
388 | if min == max { |
389 | return Ok(prefix); |
390 | } |
391 | |
392 | // It is tempting here to compile the rest here as a concatenation |
393 | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
394 | // were `aaa?a?a?`. The problem here is that it leads to this program: |
395 | // |
396 | // >000000: 61 => 01 |
397 | // 000001: 61 => 02 |
398 | // 000002: union(03, 04) |
399 | // 000003: 61 => 04 |
400 | // 000004: union(05, 06) |
401 | // 000005: 61 => 06 |
402 | // 000006: union(07, 08) |
403 | // 000007: 61 => 08 |
404 | // 000008: MATCH |
405 | // |
406 | // And effectively, once you hit state 2, the epsilon closure will |
407 | // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better |
408 | // to instead compile it like so: |
409 | // |
410 | // >000000: 61 => 01 |
411 | // 000001: 61 => 02 |
412 | // 000002: union(03, 08) |
413 | // 000003: 61 => 04 |
414 | // 000004: union(05, 08) |
415 | // 000005: 61 => 06 |
416 | // 000006: union(07, 08) |
417 | // 000007: 61 => 08 |
418 | // 000008: MATCH |
419 | // |
420 | // So that the epsilon closure of state 2 is now just 3 and 8. |
421 | let empty = self.add_empty()?; |
422 | let mut prev_end = prefix.end; |
423 | for _ in min..max { |
424 | let splits = |
425 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
426 | let compiled = self.c(hir)?; |
427 | self.patch(prev_end, splits)?; |
428 | self.patch(splits, compiled.start)?; |
429 | self.patch(splits, empty)?; |
430 | prev_end = compiled.end; |
431 | } |
432 | self.patch(prev_end, empty)?; |
433 | Ok(ThompsonRef { start: prefix.start, end: empty }) |
434 | } |
435 | |
436 | /// Compile the given expression such that it may be matched `n` or more |
437 | /// times, where `n` can be any integer. (Although a particularly large |
438 | /// integer is likely to run afoul of any configured size limits.) |
439 | /// |
440 | /// When `greedy` is true, then the preference is for the expression to |
441 | /// match as much as possible. Otherwise, it will match as little as |
442 | /// possible. |
443 | fn c_at_least( |
444 | &self, |
445 | hir: &Hir, |
446 | greedy: bool, |
447 | n: u32, |
448 | ) -> Result<ThompsonRef, Error> { |
449 | if n == 0 { |
450 | // When the expression cannot match the empty string, then we |
451 | // can get away with something much simpler: just one 'alt' |
452 | // instruction that optionally repeats itself. But if the expr |
453 | // can match the empty string... see below. |
454 | if !hir.is_match_empty() { |
455 | let splits = self.add(State::Splits { |
456 | targets: vec![], |
457 | reverse: !greedy, |
458 | })?; |
459 | let compiled = self.c(hir)?; |
460 | self.patch(splits, compiled.start)?; |
461 | self.patch(compiled.end, splits)?; |
462 | return Ok(ThompsonRef { start: splits, end: splits }); |
463 | } |
464 | |
465 | // What's going on here? Shouldn't x* be simpler than this? It |
466 | // turns out that when implementing leftmost-first (Perl-like) |
467 | // match semantics, x* results in an incorrect preference order |
468 | // when computing the transitive closure of states if and only if |
469 | // 'x' can match the empty string. So instead, we compile x* as |
470 | // (x+)?, which preserves the correct preference order. |
471 | // |
472 | // See: https://github.com/rust-lang/regex/issues/779 |
473 | let compiled = self.c(hir)?; |
474 | let plus = |
475 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
476 | self.patch(compiled.end, plus)?; |
477 | self.patch(plus, compiled.start)?; |
478 | |
479 | let question = |
480 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
481 | let empty = self.add_empty()?; |
482 | self.patch(question, compiled.start)?; |
483 | self.patch(question, empty)?; |
484 | self.patch(plus, empty)?; |
485 | Ok(ThompsonRef { start: question, end: empty }) |
486 | } else if n == 1 { |
487 | let compiled = self.c(hir)?; |
488 | let splits = |
489 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
490 | self.patch(compiled.end, splits)?; |
491 | self.patch(splits, compiled.start)?; |
492 | Ok(ThompsonRef { start: compiled.start, end: splits }) |
493 | } else { |
494 | let prefix = self.c_exactly(hir, n - 1)?; |
495 | let last = self.c(hir)?; |
496 | let splits = |
497 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
498 | self.patch(prefix.end, last.start)?; |
499 | self.patch(last.end, splits)?; |
500 | self.patch(splits, last.start)?; |
501 | Ok(ThompsonRef { start: prefix.start, end: splits }) |
502 | } |
503 | } |
504 | |
505 | /// Compile the given expression such that it may be matched zero or one |
506 | /// times. |
507 | /// |
508 | /// When `greedy` is true, then the preference is for the expression to |
509 | /// match as much as possible. Otherwise, it will match as little as |
510 | /// possible. |
511 | fn c_zero_or_one( |
512 | &self, |
513 | hir: &Hir, |
514 | greedy: bool, |
515 | ) -> Result<ThompsonRef, Error> { |
516 | let splits = |
517 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
518 | let compiled = self.c(hir)?; |
519 | let empty = self.add_empty()?; |
520 | self.patch(splits, compiled.start)?; |
521 | self.patch(splits, empty)?; |
522 | self.patch(compiled.end, empty)?; |
523 | Ok(ThompsonRef { start: splits, end: empty }) |
524 | } |
525 | |
526 | /// Compile the given HIR expression exactly `n` times. |
527 | fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> { |
528 | self.c_concat((0..n).map(|_| self.c(hir))) |
529 | } |
530 | |
531 | /// Compile the given expression and insert capturing states at the |
532 | /// beginning and end of it. The slot for the capture states is computed |
533 | /// from the index. |
534 | fn c_capture( |
535 | &self, |
536 | index: u32, |
537 | name: Option<&str>, |
538 | hir: &Hir, |
539 | ) -> Result<ThompsonRef, Error> { |
540 | // For discontiguous indices, push placeholders for earlier capture |
541 | // groups that weren't explicitly added. This can happen, for example, |
542 | // with patterns like '(a){0}(a)' where '(a){0}' is completely removed |
543 | // from the pattern. |
544 | let existing_groups_len = self.nfa.borrow().cap_index_to_name.len(); |
545 | for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) { |
546 | self.nfa.borrow_mut().cap_index_to_name.push(None); |
547 | } |
548 | if index.as_usize() >= existing_groups_len { |
549 | if let Some(name) = name { |
550 | let name = Arc::from(name); |
551 | let mut nfa = self.nfa.borrow_mut(); |
552 | nfa.cap_name_to_index.insert(Arc::clone(&name), index); |
553 | nfa.cap_index_to_name.push(Some(Arc::clone(&name))); |
554 | // This is an approximation. |
555 | nfa.memory_extra += name.len() + size_of::<u32>(); |
556 | } else { |
557 | self.nfa.borrow_mut().cap_index_to_name.push(None); |
558 | } |
559 | } |
560 | |
561 | let Some(slot) = index.checked_mul(2) else { |
562 | return Err(Error::new("capture group slots exhausted" )); |
563 | }; |
564 | let start = self.add(State::Capture { target: 0, slot })?; |
565 | let inner = self.c(hir)?; |
566 | let Some(slot) = slot.checked_add(1) else { |
567 | return Err(Error::new("capture group slots exhausted" )); |
568 | }; |
569 | let end = self.add(State::Capture { target: 0, slot })?; |
570 | self.patch(start, inner.start)?; |
571 | self.patch(inner.end, end)?; |
572 | |
573 | Ok(ThompsonRef { start, end }) |
574 | } |
575 | |
576 | /// Compile a concatenation of the sub-expressions yielded by the given |
577 | /// iterator. If the iterator yields no elements, then this compiles down |
578 | /// to an "empty" state that always matches. |
579 | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
580 | where |
581 | I: Iterator<Item = Result<ThompsonRef, Error>>, |
582 | { |
583 | let ThompsonRef { start, mut end } = match it.next() { |
584 | Some(result) => result?, |
585 | None => return self.c_empty(), |
586 | }; |
587 | for result in it { |
588 | let compiled = result?; |
589 | self.patch(end, compiled.start)?; |
590 | end = compiled.end; |
591 | } |
592 | Ok(ThompsonRef { start, end }) |
593 | } |
594 | |
595 | /// Compile an alternation, where each element yielded by the given |
596 | /// iterator represents an item in the alternation. If the iterator yields |
597 | /// no elements, then this compiles down to a "fail" state. |
598 | /// |
599 | /// In an alternation, expressions appearing earlier are "preferred" at |
600 | /// match time over expressions appearing later. (This is currently always |
601 | /// true, as this crate only supports leftmost-first semantics.) |
602 | fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
603 | where |
604 | I: Iterator<Item = Result<ThompsonRef, Error>>, |
605 | { |
606 | let first = match it.next() { |
607 | None => return self.c_fail(), |
608 | Some(result) => result?, |
609 | }; |
610 | let second = match it.next() { |
611 | None => return Ok(first), |
612 | Some(result) => result?, |
613 | }; |
614 | |
615 | let splits = |
616 | self.add(State::Splits { targets: vec![], reverse: false })?; |
617 | let end = self.add_empty()?; |
618 | self.patch(splits, first.start)?; |
619 | self.patch(first.end, end)?; |
620 | self.patch(splits, second.start)?; |
621 | self.patch(second.end, end)?; |
622 | for result in it { |
623 | let compiled = result?; |
624 | self.patch(splits, compiled.start)?; |
625 | self.patch(compiled.end, end)?; |
626 | } |
627 | Ok(ThompsonRef { start: splits, end }) |
628 | } |
629 | |
630 | /// A convenience routine for adding an empty state, also known as an |
631 | /// unconditional epsilon transition. These are quite useful for making |
632 | /// NFA construction simpler. |
633 | /// |
634 | /// (In the regex crate, we do a second pass to remove these, but don't |
635 | /// bother with that here.) |
636 | fn add_empty(&self) -> Result<StateID, Error> { |
637 | self.add(State::Goto { target: 0, look: None }) |
638 | } |
639 | |
640 | /// The common implementation of "add a state." It handles the common |
641 | /// error cases of state ID exhausting (by owning state ID allocation) and |
642 | /// whether the size limit has been exceeded. |
643 | fn add(&self, state: State) -> Result<StateID, Error> { |
644 | let id = u32::try_from(self.nfa.borrow().states.len()) |
645 | .map_err(|_| Error::new("exhausted state IDs, too many states" ))?; |
646 | self.nfa.borrow_mut().memory_extra += state.memory_usage(); |
647 | self.nfa.borrow_mut().states.push(state); |
648 | self.check_size_limit()?; |
649 | Ok(id) |
650 | } |
651 | |
652 | /// Add a transition from one state to another. |
653 | /// |
654 | /// This routine is called "patch" since it is very common to add the |
655 | /// states you want, typically with "dummy" state ID transitions, and then |
656 | /// "patch" in the real state IDs later. This is because you don't always |
657 | /// know all of the necessary state IDs to add because they might not |
658 | /// exist yet. |
659 | /// |
660 | /// # Errors |
661 | /// |
662 | /// This may error if patching leads to an increase in heap usage beyond |
663 | /// the configured size limit. Heap usage only grows when patching adds a |
664 | /// new transition (as in the case of a "splits" state). |
665 | fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { |
666 | let mut new_memory_extra = self.nfa.borrow().memory_extra; |
667 | match self.nfa.borrow_mut().states[from.as_usize()] { |
668 | State::Char { ref mut target, .. } => { |
669 | *target = to; |
670 | } |
671 | State::Ranges { ref mut target, .. } => { |
672 | *target = to; |
673 | } |
674 | State::Splits { ref mut targets, .. } => { |
675 | targets.push(to); |
676 | new_memory_extra += size_of::<StateID>(); |
677 | } |
678 | State::Goto { ref mut target, .. } => { |
679 | *target = to; |
680 | } |
681 | State::Capture { ref mut target, .. } => { |
682 | *target = to; |
683 | } |
684 | State::Fail | State::Match => {} |
685 | } |
686 | if new_memory_extra != self.nfa.borrow().memory_extra { |
687 | self.nfa.borrow_mut().memory_extra = new_memory_extra; |
688 | self.check_size_limit()?; |
689 | } |
690 | Ok(()) |
691 | } |
692 | |
693 | /// Checks that the current heap memory usage of the NFA being compiled |
694 | /// doesn't exceed the configured size limit. If it does, an error is |
695 | /// returned. |
696 | fn check_size_limit(&self) -> Result<(), Error> { |
697 | if let Some(limit) = self.config.size_limit { |
698 | if self.nfa.borrow().memory_usage() > limit { |
699 | return Err(Error::new("compiled regex exceeded size limit" )); |
700 | } |
701 | } |
702 | Ok(()) |
703 | } |
704 | } |
705 | |
706 | /// A value that represents the result of compiling a sub-expression of a |
707 | /// regex's HIR. Specifically, this represents a sub-graph of the NFA that |
708 | /// has an initial state at `start` and a final state at `end`. |
709 | #[derive (Clone, Copy, Debug)] |
710 | struct ThompsonRef { |
711 | start: StateID, |
712 | end: StateID, |
713 | } |
714 | |