use std::collections::HashMap;
use std::fmt;
use std::iter;
use std::result;
use std::sync::Arc;

use regex_syntax::hir::{self, Hir, Look};
use regex_syntax::is_word_byte;
use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};

use crate::prog::{
    EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
    InstSave, InstSplit, Program,
};

use crate::Error;

type Result = result::Result<Patch, Error>;
type ResultOrEmpty = result::Result<Option<Patch>, Error>;

#[derive(Debug)]
struct Patch {
    hole: Hole,
    entry: InstPtr,
}

/// A compiler translates a regular expression AST to a sequence of
/// instructions. The sequence of instructions represents an NFA.
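///
/// A rough usage sketch (illustrative only, since `Compiler` is reachable
/// solely through the crate-internal `internal` module, so this is not a
/// doc test; `hir` stands for a parsed `regex_syntax::hir::Hir`):
///
/// ```ignore
/// let prog = Compiler::new()
///     .size_limit(10 * (1 << 20)) // bail if the program grows past ~10 MB
///     .bytes(true)                // byte-based automaton with UTF-8 decoding baked in
///     .compile(&[hir])?;
/// ```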
// `Compiler` is only public via the `internal` module, so avoid deriving
// `Debug`.
#[allow(missing_debug_implementations)]
pub struct Compiler {
    insts: Vec<MaybeInst>,
    compiled: Program,
    capture_name_idx: HashMap<String, usize>,
    num_exprs: usize,
    size_limit: usize,
    suffix_cache: SuffixCache,
    utf8_seqs: Option<Utf8Sequences>,
    byte_classes: ByteClassSet,
    // This keeps track of extra bytes allocated while compiling the regex
    // program. Currently, this corresponds to two things. First is the heap
    // memory allocated by Unicode character classes ('InstRanges'). Second is
    // a "fake" amount of memory used by empty sub-expressions, so that enough
    // empty sub-expressions will ultimately trigger the compiler to bail
    // because of a size limit restriction. (That empty sub-expressions don't
    // add to heap memory usage is more-or-less an implementation detail.) In
    // the second case, if we don't bail, then an excessively large repetition
    // on an empty sub-expression can result in the compiler using a very
    // large amount of CPU time.
    extra_inst_bytes: usize,
}

impl Compiler {
    /// Create a new regular expression compiler.
    ///
    /// Various options can be set before calling `compile` on an expression.
    pub fn new() -> Self {
        Compiler {
            insts: vec![],
            compiled: Program::new(),
            capture_name_idx: HashMap::new(),
            num_exprs: 0,
            size_limit: 10 * (1 << 20),
            suffix_cache: SuffixCache::new(1000),
            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
            byte_classes: ByteClassSet::new(),
            extra_inst_bytes: 0,
        }
    }

    /// The size of the resulting program is limited by size_limit. If
    /// the program approximately exceeds the given size (in bytes), then
    /// compilation will stop and return an error.
    pub fn size_limit(mut self, size_limit: usize) -> Self {
        self.size_limit = size_limit;
        self
    }

    /// If bytes is true, then the program is compiled as a byte based
    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
    /// false, then the automaton is Unicode scalar value based, e.g., an
    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
    ///
    /// The specific invariant is that when returning a byte based machine,
    /// neither the `Char` nor the `Ranges` instruction is produced.
    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
    /// instruction is never produced.
    ///
    /// Note that `dfa(true)` implies `bytes(true)`.
    pub fn bytes(mut self, yes: bool) -> Self {
        self.compiled.is_bytes = yes;
        self
    }

    /// When disabled, the program compiled may match arbitrary bytes.
    ///
    /// When enabled (the default), all compiled programs exclusively match
    /// valid UTF-8 bytes.
    pub fn only_utf8(mut self, yes: bool) -> Self {
        self.compiled.only_utf8 = yes;
        self
    }

    /// When set, the machine returned is suitable for use in the DFA matching
    /// engine.
    ///
    /// In particular, this ensures that if the regex is not anchored in the
    /// beginning, then a preceding `.*?` is included in the program. (The NFA
    /// based engines handle the preceding `.*?` explicitly, which is difficult
    /// or impossible in the DFA engine.)
    pub fn dfa(mut self, yes: bool) -> Self {
        self.compiled.is_dfa = yes;
        self
    }

    /// When set, the machine returned is suitable for matching text in
    /// reverse. In particular, all concatenations are flipped.
    pub fn reverse(mut self, yes: bool) -> Self {
        self.compiled.is_reverse = yes;
        self
    }

    /// Compile a regular expression given its AST.
    ///
    /// The compiler is guaranteed to succeed unless the program exceeds the
    /// specified size limit. If the size limit is exceeded, then compilation
    /// stops and returns an error.
    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
        debug_assert!(!exprs.is_empty());
        self.num_exprs = exprs.len();
        if exprs.len() == 1 {
            self.compile_one(&exprs[0])
        } else {
            self.compile_many(exprs)
        }
    }

    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
        if self.compiled.only_utf8
            && expr.properties().look_set().contains(Look::WordAsciiNegate)
        {
            return Err(Error::Syntax(
                "ASCII-only \\B is not allowed in Unicode regexes \
                 because it may result in invalid UTF-8 matches"
                    .to_string(),
            ));
        }
        // If we're compiling a forward DFA and we aren't anchored, then
        // add a `.*?` before the first capture group.
        // Other matching engines handle this by baking the logic into the
        // matching engine itself.
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        self.compiled.is_anchored_start =
            expr.properties().look_set_prefix().contains(Look::Start);
        self.compiled.is_anchored_end =
            expr.properties().look_set_suffix().contains(Look::End);
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        }
        self.compiled.captures = vec![None];
        let patch =
            self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
        if self.compiled.needs_dotstar() {
            self.fill(dotstar_patch.hole, patch.entry);
        } else {
            self.compiled.start = patch.entry;
        }
        self.fill_to_next(patch.hole);
        self.compiled.matches = vec![self.insts.len()];
        self.push_compiled(Inst::Match(0));
        self.compiled.static_captures_len =
            expr.properties().static_explicit_captures_len();
        self.compile_finish()
    }

    fn compile_many(
        mut self,
        exprs: &[Hir],
    ) -> result::Result<Program, Error> {
        debug_assert!(exprs.len() > 1);

        self.compiled.is_anchored_start = exprs
            .iter()
            .all(|e| e.properties().look_set_prefix().contains(Look::Start));
        self.compiled.is_anchored_end = exprs
            .iter()
            .all(|e| e.properties().look_set_suffix().contains(Look::End));
        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
        if self.compiled.needs_dotstar() {
            dotstar_patch = self.c_dotstar()?;
            self.compiled.start = dotstar_patch.entry;
        } else {
            self.compiled.start = 0; // first instruction is always split
        }
        self.fill_to_next(dotstar_patch.hole);

        let mut prev_hole = Hole::None;
        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } =
                self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
            self.fill_to_next(hole);
            self.compiled.matches.push(self.insts.len());
            self.push_compiled(Inst::Match(i));
            prev_hole = self.fill_split(split, Some(entry), None);
        }
        let i = exprs.len() - 1;
        let Patch { hole, entry } =
            self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
        self.fill(prev_hole, entry);
        self.fill_to_next(hole);
        self.compiled.matches.push(self.insts.len());
        self.push_compiled(Inst::Match(i));
        self.compile_finish()
    }

    fn compile_finish(mut self) -> result::Result<Program, Error> {
        self.compiled.insts =
            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
        self.compiled.byte_classes = self.byte_classes.byte_classes();
        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
        Ok(self.compiled)
    }

    /// Compile expr into self.insts, returning a patch on success,
    /// or an error if we run out of memory.
    ///
    /// All of the c_* methods of the compiler share the contract outlined
    /// here.
    ///
    /// The main thing that a c_* method does is mutate `self.insts`
    /// to add a list of mostly compiled instructions required to execute
    /// the given expression. `self.insts` contains MaybeInsts rather than
    /// Insts because there is some backpatching required.
    ///
    /// The `Patch` value returned by each c_* method provides metadata
    /// about the compiled instructions emitted to `self.insts`. The
    /// `entry` member of the patch refers to the first instruction
    /// (the entry point), while the `hole` member contains zero or
    /// more offsets to partial instructions that need to be backpatched.
    /// The c_* routine can't know where its list of instructions is going to
    /// jump to after execution, so it is up to the caller to patch
    /// these jumps to point to the right place. So when compiling some
    /// expression, e, we would end up with a situation that looks like:
    ///
    /// ```text
    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
    ///                     ^              ^             ^
    ///                     |               \           /
    ///                   entry              \         /
    ///                                         hole
    /// ```
    ///
    /// To compile two expressions, e1 and e2, concatenated together we
    /// would do:
    ///
    /// ```ignore
    /// let patch1 = self.c(e1);
    /// let patch2 = self.c(e2);
    /// ```
    ///
    /// which leaves us with a situation that looks like
    ///
    /// ```text
    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
    ///                     ^        ^            ^        ^
    ///                     |        |            |        |
    ///                  entry1    hole1       entry2    hole2
    /// ```
    ///
    /// Then to merge the two patches together into one we would backpatch
    /// hole1 with entry2 and return a new patch that enters at entry1
    /// and has hole2 for a hole. In fact, if you look at the c_concat
    /// method you will see that it does exactly this, though it handles
    /// a list of expressions rather than just the two that we use for
    /// an example.
    ///
    /// Ok(None) is returned when an expression is compiled to no
    /// instruction, and so no patch.entry value makes sense.
    fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
        use crate::prog;
        use regex_syntax::hir::HirKind::*;

        self.check_size()?;
        match *expr.kind() {
            Empty => self.c_empty(),
            Literal(hir::Literal(ref bytes)) => {
                if self.compiled.is_reverse {
                    let mut bytes = bytes.to_vec();
                    bytes.reverse();
                    self.c_literal(&bytes)
                } else {
                    self.c_literal(bytes)
                }
            }
            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
            Class(hir::Class::Bytes(ref cls)) => {
                if self.compiled.uses_bytes() {
                    self.c_class_bytes(cls.ranges())
                } else {
                    assert!(cls.is_ascii());
                    let mut char_ranges = vec![];
                    for r in cls.iter() {
                        let (s, e) = (r.start() as char, r.end() as char);
                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
                    }
                    self.c_class(&char_ranges)
                }
            }
            Look(ref look) => match *look {
                hir::Look::Start if self.compiled.is_reverse => {
                    self.c_empty_look(prog::EmptyLook::EndText)
                }
                hir::Look::Start => {
                    self.c_empty_look(prog::EmptyLook::StartText)
                }
                hir::Look::End if self.compiled.is_reverse => {
                    self.c_empty_look(prog::EmptyLook::StartText)
                }
                hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
                hir::Look::StartLF if self.compiled.is_reverse => {
                    self.byte_classes.set_range(b'\n', b'\n');
                    self.c_empty_look(prog::EmptyLook::EndLine)
                }
                hir::Look::StartLF => {
                    self.byte_classes.set_range(b'\n', b'\n');
                    self.c_empty_look(prog::EmptyLook::StartLine)
                }
                hir::Look::EndLF if self.compiled.is_reverse => {
                    self.byte_classes.set_range(b'\n', b'\n');
                    self.c_empty_look(prog::EmptyLook::StartLine)
                }
                hir::Look::EndLF => {
                    self.byte_classes.set_range(b'\n', b'\n');
                    self.c_empty_look(prog::EmptyLook::EndLine)
                }
                hir::Look::StartCRLF | hir::Look::EndCRLF => {
                    return Err(Error::Syntax(
                        "CRLF-aware line anchors are not supported yet"
                            .to_string(),
                    ));
                }
                hir::Look::WordAscii => {
                    self.byte_classes.set_word_boundary();
                    self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
                }
                hir::Look::WordAsciiNegate => {
                    self.byte_classes.set_word_boundary();
                    self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
                }
                hir::Look::WordUnicode => {
                    if !cfg!(feature = "unicode-perl") {
                        return Err(Error::Syntax(
                            "Unicode word boundaries are unavailable when \
                             the unicode-perl feature is disabled"
                                .to_string(),
                        ));
                    }
                    self.compiled.has_unicode_word_boundary = true;
                    self.byte_classes.set_word_boundary();
                    // We also make sure that all ASCII bytes are in a
                    // different class from non-ASCII bytes. Otherwise, it's
                    // possible for ASCII bytes to get lumped into the same
                    // class as non-ASCII bytes. This in turn may cause the
                    // lazy DFA to falsely start when it sees an ASCII byte
                    // that maps to a byte class with non-ASCII bytes. This
                    // ensures that never happens.
                    self.byte_classes.set_range(0, 0x7F);
                    self.c_empty_look(prog::EmptyLook::WordBoundary)
                }
                hir::Look::WordUnicodeNegate => {
                    if !cfg!(feature = "unicode-perl") {
                        return Err(Error::Syntax(
                            "Unicode word boundaries are unavailable when \
                             the unicode-perl feature is disabled"
                                .to_string(),
                        ));
                    }
                    self.compiled.has_unicode_word_boundary = true;
                    self.byte_classes.set_word_boundary();
                    // See comments above for why we set the ASCII range here.
                    self.byte_classes.set_range(0, 0x7F);
                    self.c_empty_look(prog::EmptyLook::NotWordBoundary)
                }
            },
            Capture(hir::Capture { index, ref name, ref sub }) => {
                if index as usize >= self.compiled.captures.len() {
                    let name = match *name {
                        None => None,
                        Some(ref boxed_str) => Some(boxed_str.to_string()),
                    };
                    self.compiled.captures.push(name.clone());
                    if let Some(name) = name {
                        self.capture_name_idx.insert(name, index as usize);
                    }
                }
                self.c_capture(2 * index as usize, sub)
            }
            Concat(ref es) => {
                if self.compiled.is_reverse {
                    self.c_concat(es.iter().rev())
                } else {
                    self.c_concat(es)
                }
            }
            Alternation(ref es) => self.c_alternate(&**es),
            Repetition(ref rep) => self.c_repeat(rep),
        }
    }

    fn c_empty(&mut self) -> ResultOrEmpty {
        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
        // See: CVE-2022-24713
        //
        // Since 'empty' sub-expressions don't increase the size of
        // the actual compiled object, we "fake" an increase in its
        // size so that our 'check_size_limit' routine will eventually
        // stop compilation if there are too many empty sub-expressions
        // (e.g., via a large repetition).
        self.extra_inst_bytes += std::mem::size_of::<Inst>();
        Ok(None)
    }

    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
        if self.num_exprs > 1 || self.compiled.is_dfa {
            // Don't ever compile Save instructions for regex sets because
            // they are never used. They are also never used in DFA programs
            // because DFAs can't handle captures.
            self.c(expr)
        } else {
            let entry = self.insts.len();
            let hole = self.push_hole(InstHole::Save { slot: first_slot });
            let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
            self.fill(hole, patch.entry);
            self.fill_to_next(patch.hole);
            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
            Ok(Some(Patch { hole, entry }))
        }
    }

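    /// Compiles the unanchored prefix: a non-greedy `.*?` (or its
    /// arbitrary-byte analogue when matching invalid UTF-8 is allowed),
    /// which `compile_one`/`compile_many` prepend whenever
    /// `self.compiled.needs_dotstar()` says the program needs one.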
    fn c_dotstar(&mut self) -> Result {
        let hir = if self.compiled.only_utf8() {
            Hir::dot(hir::Dot::AnyChar)
        } else {
            Hir::dot(hir::Dot::AnyByte)
        };
        Ok(self
            .c(&Hir::repetition(hir::Repetition {
                min: 0,
                max: None,
                greedy: false,
                sub: Box::new(hir),
            }))?
            .unwrap())
    }

    fn c_char(&mut self, c: char) -> ResultOrEmpty {
        if self.compiled.uses_bytes() {
            if c.is_ascii() {
                let b = c as u8;
                let hole =
                    self.push_hole(InstHole::Bytes { start: b, end: b });
                self.byte_classes.set_range(b, b);
                Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
            } else {
                self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
            }
        } else {
            let hole = self.push_hole(InstHole::Char { c });
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }

    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
        use std::mem::size_of;

        if ranges.is_empty() {
            return Err(Error::Syntax(
                "empty character classes are not allowed".to_string(),
            ));
        }
        if self.compiled.uses_bytes() {
            Ok(Some(CompileClass { c: self, ranges }.compile()?))
        } else {
            let ranges: Vec<(char, char)> =
                ranges.iter().map(|r| (r.start(), r.end())).collect();
            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
                self.push_hole(InstHole::Char { c: ranges[0].0 })
            } else {
                self.extra_inst_bytes +=
                    ranges.len() * (size_of::<char>() * 2);
                self.push_hole(InstHole::Ranges { ranges })
            };
            Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
        }
    }

    fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
    }

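    /// Compiles a byte class as a chain of splits, one alternative per
    /// range. Roughly, the layout for the two ranges of `[a-cx-z]` looks
    /// like this (each branch exits through a hole the caller patches):
    ///
    /// ```text
    /// 0: split(1, 2)
    /// 1: bytes a-c   (hole -> exit)
    /// 2: bytes x-z   (hole -> exit)
    /// ```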
    fn c_class_bytes(
        &mut self,
        ranges: &[hir::ClassBytesRange],
    ) -> ResultOrEmpty {
        if ranges.is_empty() {
            return Err(Error::Syntax(
                "empty character classes are not allowed".to_string(),
            ));
        }

        let first_split_entry = self.insts.len();
        let mut holes = vec![];
        let mut prev_hole = Hole::None;
        for r in &ranges[0..ranges.len() - 1] {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let next = self.insts.len();
            self.byte_classes.set_range(r.start(), r.end());
            holes.push(self.push_hole(InstHole::Bytes {
                start: r.start(),
                end: r.end(),
            }));
            prev_hole = self.fill_split(split, Some(next), None);
        }
        let next = self.insts.len();
        let r = &ranges[ranges.len() - 1];
        self.byte_classes.set_range(r.start(), r.end());
        holes.push(
            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
        );
        self.fill(prev_hole, next);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
        let hole = self.push_hole(InstHole::EmptyLook { look });
        Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
    }

    fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
        match core::str::from_utf8(bytes) {
            Ok(string) => {
                let mut it = string.chars();
                let Patch { mut hole, entry } = loop {
                    match it.next() {
                        None => return self.c_empty(),
                        Some(ch) => {
                            if let Some(p) = self.c_char(ch)? {
                                break p;
                            }
                        }
                    }
                };
                for ch in it {
                    if let Some(p) = self.c_char(ch)? {
                        self.fill(hole, p.entry);
                        hole = p.hole;
                    }
                }
                Ok(Some(Patch { hole, entry }))
            }
            Err(_) => {
                assert!(self.compiled.uses_bytes());
                let mut it = bytes.iter().copied();
                let Patch { mut hole, entry } = loop {
                    match it.next() {
                        None => return self.c_empty(),
                        Some(byte) => {
                            if let Some(p) = self.c_byte(byte)? {
                                break p;
                            }
                        }
                    }
                };
                for byte in it {
                    if let Some(p) = self.c_byte(byte)? {
                        self.fill(hole, p.entry);
                        hole = p.hole;
                    }
                }
                Ok(Some(Patch { hole, entry }))
            }
        }
    }

    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
    where
        I: IntoIterator<Item = &'a Hir>,
    {
        let mut exprs = exprs.into_iter();
        let Patch { mut hole, entry } = loop {
            match exprs.next() {
                None => return self.c_empty(),
                Some(e) => {
                    if let Some(p) = self.c(e)? {
                        break p;
                    }
                }
            }
        };
        for e in exprs {
            if let Some(p) = self.c(e)? {
                self.fill(hole, p.entry);
                hole = p.hole;
            }
        }
        Ok(Some(Patch { hole, entry }))
    }

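    /// Compiles an alternation as a chain of splits. A sketch of the layout
    /// for `a|b|c` (entry is the first split; every branch exits through a
    /// hole that the caller patches):
    ///
    /// ```text
    /// 0: split(1, 2)
    /// 1: 'a'   (hole -> exit)
    /// 2: split(3, 4)
    /// 3: 'b'   (hole -> exit)
    /// 4: 'c'   (hole -> exit)
    /// ```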
    fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
        debug_assert!(
            exprs.len() >= 2,
            "alternates must have at least 2 exprs"
        );

        // Initial entry point is always the first split.
        let first_split_entry = self.insts.len();

        // Save up all of the holes from each alternate. They will all get
        // patched to point to the same location.
        let mut holes = vec![];

        // true indicates that the hole is a split where we want to fill
        // the second branch.
        let mut prev_hole = (Hole::None, false);
        for e in &exprs[0..exprs.len() - 1] {
            if prev_hole.1 {
                let next = self.insts.len();
                self.fill_split(prev_hole.0, None, Some(next));
            } else {
                self.fill_to_next(prev_hole.0);
            }
            let split = self.push_split_hole();
            if let Some(Patch { hole, entry }) = self.c(e)? {
                holes.push(hole);
                prev_hole = (self.fill_split(split, Some(entry), None), false);
            } else {
                let (split1, split2) = split.dup_one();
                holes.push(split1);
                prev_hole = (split2, true);
            }
        }
        if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
            holes.push(hole);
            if prev_hole.1 {
                self.fill_split(prev_hole.0, None, Some(entry));
            } else {
                self.fill(prev_hole.0, entry);
            }
        } else {
            // We ignore prev_hole.1. When it's true, it means we have two
            // empty branches both pushing prev_hole.0 into holes, so both
            // branches will go to the same place anyway.
            holes.push(prev_hole.0);
        }
        Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
    }

    fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
        match (rep.min, rep.max) {
            (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
            (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
            (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
            (min, None) => {
                self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
            }
            (min, Some(max)) => {
                self.c_repeat_range(&rep.sub, rep.greedy, min, max)
            }
        }
    }

    fn c_repeat_zero_or_one(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return self.pop_split_hole(),
        };
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        let holes = vec![hole_rep, split_hole];
        Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
    }

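    /// Compiles `e*` (or `e*?`). A sketch of the layout for a greedy `a*`,
    /// where the split's second branch is the hole the caller patches with
    /// the exit (the branches swap for the non-greedy case):
    ///
    /// ```text
    /// 0: split(1, exit)
    /// 1: 'a' -> 0
    /// ```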
    fn c_repeat_zero_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let split_entry = self.insts.len();
        let split = self.push_split_hole();
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return self.pop_split_hole(),
        };

        self.fill(hole_rep, split_entry);
        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: split_entry }))
    }

    fn c_repeat_one_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
    ) -> ResultOrEmpty {
        let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
            Some(p) => p,
            None => return Ok(None),
        };
        self.fill_to_next(hole_rep);
        let split = self.push_split_hole();

        let split_hole = if greedy {
            self.fill_split(split, Some(entry_rep), None)
        } else {
            self.fill_split(split, None, Some(entry_rep))
        };
        Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
    }

    fn c_repeat_range_min_or_more(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
    ) -> ResultOrEmpty {
        let min = u32_to_usize(min);
        // Using next_inst() is ok, because we can't return it (concat would
        // have to return Some(_) while c_repeat_range_min_or_more returns
        // None).
        let patch_concat = self
            .c_concat(iter::repeat(expr).take(min))?
            .unwrap_or_else(|| self.next_inst());
        if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
            self.fill(patch_concat.hole, patch_rep.entry);
            Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
        } else {
            Ok(None)
        }
    }

    fn c_repeat_range(
        &mut self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> ResultOrEmpty {
        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
        debug_assert!(min <= max);
        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
        if min == max {
            return Ok(patch_concat);
        }
        // Same reasoning as in c_repeat_range_min_or_more (we know that min <
        // max at this point).
        let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
        let initial_entry = patch_concat.entry;
        // It is much simpler to compile, e.g., `a{2,5}` as:
        //
        //     aaa?a?a?
        //
        // But you end up with a sequence of instructions like this:
        //
        //     0: 'a'
        //     1: 'a'
        //     2: split(3, 4)
        //     3: 'a'
        //     4: split(5, 6)
        //     5: 'a'
        //     6: split(7, 8)
        //     7: 'a'
        //     8: MATCH
        //
        // This is *incredibly* inefficient because the splits end
        // up forming a chain, which has to be resolved every time a
        // transition is followed.
        let mut holes = vec![];
        let mut prev_hole = patch_concat.hole;
        for _ in min..max {
            self.fill_to_next(prev_hole);
            let split = self.push_split_hole();
            let Patch { hole, entry } = match self.c(expr)? {
                Some(p) => p,
                None => return self.pop_split_hole(),
            };
            prev_hole = hole;
            if greedy {
                holes.push(self.fill_split(split, Some(entry), None));
            } else {
                holes.push(self.fill_split(split, None, Some(entry)));
            }
        }
        holes.push(prev_hole);
        Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
    }

    /// Can be used as a default value for the c_* functions when the call to
    /// the c_* function is followed by inserting at least one instruction
    /// that is always executed after the ones written by the c_* function.
    fn next_inst(&self) -> Patch {
        Patch { hole: Hole::None, entry: self.insts.len() }
    }

    fn fill(&mut self, hole: Hole, goto: InstPtr) {
        match hole {
            Hole::None => {}
            Hole::One(pc) => {
                self.insts[pc].fill(goto);
            }
            Hole::Many(holes) => {
                for hole in holes {
                    self.fill(hole, goto);
                }
            }
        }
    }

    fn fill_to_next(&mut self, hole: Hole) {
        let next = self.insts.len();
        self.fill(hole, next);
    }

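    /// Fills a split hole with the given goto targets. Providing only one
    /// target leaves the split half-filled; the returned hole tracks
    /// whatever still needs to be patched.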
    fn fill_split(
        &mut self,
        hole: Hole,
        goto1: Option<InstPtr>,
        goto2: Option<InstPtr>,
    ) -> Hole {
        match hole {
            Hole::None => Hole::None,
            Hole::One(pc) => match (goto1, goto2) {
                (Some(goto1), Some(goto2)) => {
                    self.insts[pc].fill_split(goto1, goto2);
                    Hole::None
                }
                (Some(goto1), None) => {
                    self.insts[pc].half_fill_split_goto1(goto1);
                    Hole::One(pc)
                }
                (None, Some(goto2)) => {
                    self.insts[pc].half_fill_split_goto2(goto2);
                    Hole::One(pc)
                }
                (None, None) => unreachable!(
                    "at least one of the split \
                     holes must be filled"
                ),
            },
            Hole::Many(holes) => {
                let mut new_holes = vec![];
                for hole in holes {
                    new_holes.push(self.fill_split(hole, goto1, goto2));
                }
                if new_holes.is_empty() {
                    Hole::None
                } else if new_holes.len() == 1 {
                    new_holes.pop().unwrap()
                } else {
                    Hole::Many(new_holes)
                }
            }
        }
    }

    fn push_compiled(&mut self, inst: Inst) {
        self.insts.push(MaybeInst::Compiled(inst));
    }

    fn push_hole(&mut self, inst: InstHole) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Uncompiled(inst));
        Hole::One(hole)
    }

    fn push_split_hole(&mut self) -> Hole {
        let hole = self.insts.len();
        self.insts.push(MaybeInst::Split);
        Hole::One(hole)
    }

    fn pop_split_hole(&mut self) -> ResultOrEmpty {
        self.insts.pop();
        Ok(None)
    }

    fn check_size(&self) -> result::Result<(), Error> {
        use std::mem::size_of;

        let size =
            self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
        if size > self.size_limit {
            Err(Error::CompiledTooBig(self.size_limit))
        } else {
            Ok(())
        }
    }
}

#[derive(Debug)]
enum Hole {
    None,
    One(InstPtr),
    Many(Vec<Hole>),
}

impl Hole {
    fn dup_one(self) -> (Self, Self) {
        match self {
            Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
            Hole::None | Hole::Many(_) => {
                unreachable!("must be called on single hole")
            }
        }
    }
}

#[derive(Clone, Debug)]
enum MaybeInst {
    Compiled(Inst),
    Uncompiled(InstHole),
    Split,
    Split1(InstPtr),
    Split2(InstPtr),
}

impl MaybeInst {
    fn fill(&mut self, goto: InstPtr) {
        let maybeinst = match *self {
            MaybeInst::Split => MaybeInst::Split1(goto),
            MaybeInst::Uncompiled(ref inst) => {
                MaybeInst::Compiled(inst.fill(goto))
            }
            MaybeInst::Split1(goto1) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1,
                    goto2: goto,
                }))
            }
            MaybeInst::Split2(goto2) => {
                MaybeInst::Compiled(Inst::Split(InstSplit {
                    goto1: goto,
                    goto2,
                }))
            }
            _ => unreachable!(
                "not all instructions were compiled! \
                 found uncompiled instruction: {:?}",
                self
            ),
        };
        *self = maybeinst;
    }

    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
        let filled = match *self {
            MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Compiled(filled);
    }

    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto1,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split1(half_filled);
    }

    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
        let half_filled = match *self {
            MaybeInst::Split => goto2,
            _ => unreachable!(
                "must be called on Split instruction, \
                 instead it was called on: {:?}",
                self
            ),
        };
        *self = MaybeInst::Split2(half_filled);
    }

    fn unwrap(self) -> Inst {
        match self {
            MaybeInst::Compiled(inst) => inst,
            _ => unreachable!(
                "must be called on a compiled instruction, \
                 instead it was called on: {:?}",
                self
            ),
        }
    }
}

#[derive(Clone, Debug)]
enum InstHole {
    Save { slot: usize },
    EmptyLook { look: EmptyLook },
    Char { c: char },
    Ranges { ranges: Vec<(char, char)> },
    Bytes { start: u8, end: u8 },
}

impl InstHole {
    fn fill(&self, goto: InstPtr) -> Inst {
        match *self {
            InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
            InstHole::EmptyLook { look } => {
                Inst::EmptyLook(InstEmptyLook { goto, look })
            }
            InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
            InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
                goto,
                ranges: ranges.clone().into_boxed_slice(),
            }),
            InstHole::Bytes { start, end } => {
                Inst::Bytes(InstBytes { goto, start, end })
            }
        }
    }
}

struct CompileClass<'a, 'b> {
    c: &'a mut Compiler,
    ranges: &'b [hir::ClassUnicodeRange],
}

impl<'a, 'b> CompileClass<'a, 'b> {
    fn compile(mut self) -> Result {
        let mut holes = vec![];
        let mut initial_entry = None;
        let mut last_split = Hole::None;
        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
        self.c.suffix_cache.clear();

        for (i, range) in self.ranges.iter().enumerate() {
            let is_last_range = i + 1 == self.ranges.len();
            utf8_seqs.reset(range.start(), range.end());
            let mut it = (&mut utf8_seqs).peekable();
            loop {
                let utf8_seq = match it.next() {
                    None => break,
                    Some(utf8_seq) => utf8_seq,
                };
                if is_last_range && it.peek().is_none() {
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    self.c.fill(last_split, entry);
                    last_split = Hole::None;
                    if initial_entry.is_none() {
                        initial_entry = Some(entry);
                    }
                } else {
                    if initial_entry.is_none() {
                        initial_entry = Some(self.c.insts.len());
                    }
                    self.c.fill_to_next(last_split);
                    last_split = self.c.push_split_hole();
                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                    holes.push(hole);
                    last_split =
                        self.c.fill_split(last_split, Some(entry), None);
                }
            }
        }
        self.c.utf8_seqs = Some(utf8_seqs);
        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
    }

    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
        if self.c.compiled.is_reverse {
            self.c_utf8_seq_(seq)
        } else {
            self.c_utf8_seq_(seq.into_iter().rev())
        }
    }

    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
    where
        I: IntoIterator<Item = &'r Utf8Range>,
    {
        // The initial instruction for each UTF-8 sequence should be the same.
        let mut from_inst = ::std::usize::MAX;
        let mut last_hole = Hole::None;
        for byte_range in seq {
            let key = SuffixCacheKey {
                from_inst,
                start: byte_range.start,
                end: byte_range.end,
            };
            {
                let pc = self.c.insts.len();
                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
                    from_inst = cached_pc;
                    continue;
                }
            }
            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
            if from_inst == ::std::usize::MAX {
                last_hole = self.c.push_hole(InstHole::Bytes {
                    start: byte_range.start,
                    end: byte_range.end,
                });
            } else {
                self.c.push_compiled(Inst::Bytes(InstBytes {
                    goto: from_inst,
                    start: byte_range.start,
                    end: byte_range.end,
                }));
            }
            from_inst = self.c.insts.len().checked_sub(1).unwrap();
            debug_assert!(from_inst < ::std::usize::MAX);
        }
        debug_assert!(from_inst < ::std::usize::MAX);
        Ok(Patch { hole: last_hole, entry: from_inst })
    }
}

/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
/// The set of byte ranges looks like this:
///
/// [0-7F]
/// [C2-DF][80-BF]
/// [E0][A0-BF][80-BF]
/// [E1-EC][80-BF][80-BF]
/// [ED][80-9F][80-BF]
/// [EE-EF][80-BF][80-BF]
///
/// Each line above translates to one alternate in the compiled regex program.
/// However, all but one of the alternates end in the same suffix, which is
/// a waste of an instruction. The suffix cache facilitates reusing them
/// across alternates.
///
/// Note that a HashMap could be trivially used for this, but we don't need
/// its overhead. Some small bounded space (LRU style) is more than enough.
///
/// This uses a similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
/// except it uses hashes as original indices and then compares full keys for
/// validation against the `dense` array.
#[derive(Debug)]
struct SuffixCache {
    sparse: Box<[usize]>,
    dense: Vec<SuffixCacheEntry>,
}

#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheEntry {
    key: SuffixCacheKey,
    pc: InstPtr,
}

#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
struct SuffixCacheKey {
    from_inst: InstPtr,
    start: u8,
    end: u8,
}

impl SuffixCache {
    fn new(size: usize) -> Self {
        SuffixCache {
            sparse: vec![0usize; size].into(),
            dense: Vec::with_capacity(size),
        }
    }

    fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
        let hash = self.hash(&key);
        let pos = &mut self.sparse[hash];
        if let Some(entry) = self.dense.get(*pos) {
            if entry.key == key {
                return Some(entry.pc);
            }
        }
        *pos = self.dense.len();
        self.dense.push(SuffixCacheEntry { key, pc });
        None
    }

    fn clear(&mut self) {
        self.dense.clear();
    }

    fn hash(&self, suffix: &SuffixCacheKey) -> usize {
        // Basic FNV-1a hash as described:
        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
        const FNV_PRIME: u64 = 1_099_511_628_211;
        let mut h = 14_695_981_039_346_656_037;
        h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
        h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
        h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
        (h as usize) % self.sparse.len()
    }
}

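/// A set of explicit byte class boundaries, used to compute the byte classes
/// stored in the compiled `Program`.
///
/// `set_range(start, end)` marks `start - 1` (if any) and `end` as the last
/// bytes of their classes; `byte_classes` then maps every byte to a class
/// identifier such that two bytes share a class exactly when no marked
/// boundary lies between them. Fewer classes means smaller transition tables
/// in the DFA.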
struct ByteClassSet([bool; 256]);

impl ByteClassSet {
    fn new() -> Self {
        ByteClassSet([false; 256])
    }

    fn set_range(&mut self, start: u8, end: u8) {
        debug_assert!(start <= end);
        if start > 0 {
            self.0[start as usize - 1] = true;
        }
        self.0[end as usize] = true;
    }

    fn set_word_boundary(&mut self) {
        // We need to mark all ranges of bytes whose pairs result in
        // evaluating \b differently.
        let iswb = is_word_byte;
        let mut b1: u16 = 0;
        let mut b2: u16;
        while b1 <= 255 {
            b2 = b1 + 1;
            while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
                b2 += 1;
            }
            self.set_range(b1 as u8, (b2 - 1) as u8);
            b1 = b2;
        }
    }

    fn byte_classes(&self) -> Vec<u8> {
        // N.B. If you're debugging the DFA, it's useful to simply return
        // `(0..256).collect()`, which effectively removes the byte classes
        // and makes the transitions easier to read.
        // (0usize..256).map(|x| x as u8).collect()
        let mut byte_classes = vec![0; 256];
        let mut class = 0u8;
        let mut i = 0;
        loop {
            byte_classes[i] = class as u8;
            if i >= 255 {
                break;
            }
            if self.0[i] {
                class = class.checked_add(1).unwrap();
            }
            i += 1;
        }
        byte_classes
    }
}

impl fmt::Debug for ByteClassSet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
    }
}

fn u32_to_usize(n: u32) -> usize {
    // In case usize is less than 32 bits, we need to guard against overflow.
    // On most platforms this compiles to nothing.
    // TODO Use `std::convert::TryFrom` once it's stable.
    if (n as u64) > (::std::usize::MAX as u64) {
        panic!("BUG: {} is too big to be pointer sized", n)
    }
    n as usize
}

#[cfg(test)]
mod tests {
    use super::ByteClassSet;

    #[test]
    fn byte_classes() {
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[b'a' as usize - 1], 0);
        assert_eq!(classes[b'a' as usize], 1);
        assert_eq!(classes[b'm' as usize], 1);
        assert_eq!(classes[b'z' as usize], 1);
        assert_eq!(classes[b'z' as usize + 1], 2);
        assert_eq!(classes[254], 2);
        assert_eq!(classes[255], 2);

        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        assert_eq!(classes[0], 0);
        assert_eq!(classes[1], 0);
        assert_eq!(classes[2], 0);
        assert_eq!(classes[3], 1);
        assert_eq!(classes[4], 2);
        assert_eq!(classes[5], 2);
        assert_eq!(classes[6], 2);
        assert_eq!(classes[7], 3);
        assert_eq!(classes[255], 3);
    }
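
    // Sanity check: after `set_word_boundary()`, ASCII word bytes
    // ([0-9A-Za-z_]) must never share a class with their non-word neighbors,
    // since `\b` evaluates differently across such pairs. Only relative
    // class equality is asserted here, because the exact class numbering is
    // an implementation detail.
    #[test]
    fn word_boundary_byte_classes() {
        let mut set = ByteClassSet::new();
        set.set_word_boundary();
        let classes = set.byte_classes();
        // Bytes within one run of word bytes share a class...
        assert_eq!(classes[b'a' as usize], classes[b'z' as usize]);
        assert_eq!(classes[b'0' as usize], classes[b'9' as usize]);
        // ...but word bytes and adjacent non-word bytes do not.
        assert_ne!(classes[b'z' as usize], classes[b'{' as usize]);
        assert_ne!(classes[b'_' as usize], classes[b'`' as usize]);
        assert_ne!(classes[b'/' as usize], classes[b'0' as usize]);
    }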

    #[test]
    fn full_byte_classes() {
        let mut set = ByteClassSet::new();
        for i in 0..256u16 {
            set.set_range(i as u8, i as u8);
        }
        assert_eq!(set.byte_classes().len(), 256);
    }
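
    // A minimal check of the expected `SuffixCache` behavior: a key misses
    // (and is recorded) on first lookup, hits with the originally recorded
    // program counter afterwards, and misses again after `clear`.
    #[test]
    fn suffix_cache_hit_and_clear() {
        use super::{SuffixCache, SuffixCacheKey};

        let mut cache = SuffixCache::new(10);
        let key = SuffixCacheKey { from_inst: 5, start: 0x80, end: 0xBF };
        // First lookup misses and records `pc = 12`.
        assert_eq!(cache.get(key, 12), None);
        // Subsequent lookups of the same key return the recorded pc.
        assert_eq!(cache.get(key, 13), Some(12));
        // Clearing the cache forgets all recorded suffixes.
        cache.clear();
        assert_eq!(cache.get(key, 14), None);
    }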
}