1 | use alloc::{boxed::Box, string::String, vec, vec::Vec}; |
2 | |
3 | use crate::{error::Error, utf8}; |
4 | |
5 | mod parse; |
6 | |
7 | /// Escapes all regular expression meta characters in `pattern`. |
8 | /// |
9 | /// The string returned may be safely used as a literal in a regular |
10 | /// expression. |
11 | pub fn escape(pattern: &str) -> String { |
12 | let mut buf: String = String::new(); |
13 | buf.reserve(additional:pattern.len()); |
14 | for ch: char in pattern.chars() { |
15 | if is_meta_character(ch) { |
16 | buf.push(ch:' \\' ); |
17 | } |
18 | buf.push(ch); |
19 | } |
20 | buf |
21 | } |
22 | |
23 | /// Returns true if the given character has significance in a regex. |
24 | /// |
25 | /// Generally speaking, these are the only characters which _must_ be escaped |
26 | /// in order to match their literal meaning. For example, to match a literal |
27 | /// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For |
28 | /// example, `-` is treated as a meta character because of its significance |
29 | /// for writing ranges inside of character classes, but the regex `-` will |
30 | /// match a literal `-` because `-` has no special meaning outside of character |
31 | /// classes. |
32 | /// |
33 | /// In order to determine whether a character may be escaped at all, the |
34 | /// [`is_escapeable_character`] routine should be used. The difference between |
35 | /// `is_meta_character` and `is_escapeable_character` is that the latter will |
36 | /// return true for some characters that are _not_ meta characters. For |
37 | /// example, `%` and `\%` both match a literal `%` in all contexts. In other |
38 | /// words, `is_escapeable_character` includes "superfluous" escapes. |
39 | /// |
40 | /// Note that the set of characters for which this function returns `true` or |
41 | /// `false` is fixed and won't change in a semver compatible release. (In this |
42 | /// case, "semver compatible release" actually refers to the `regex` crate |
43 | /// itself, since reducing or expanding the set of meta characters would be a |
44 | /// breaking change for not just `regex-syntax` but also `regex` itself.) |
45 | fn is_meta_character(c: char) -> bool { |
46 | match c { |
47 | ' \\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' |
48 | | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, |
49 | _ => false, |
50 | } |
51 | } |
52 | |
53 | /// Returns true if the given character can be escaped in a regex. |
54 | /// |
55 | /// This returns true in all cases that `is_meta_character` returns true, but |
56 | /// also returns true in some cases where `is_meta_character` returns false. |
57 | /// For example, `%` is not a meta character, but it is escapeable. That is, |
58 | /// `%` and `\%` both match a literal `%` in all contexts. |
59 | /// |
60 | /// The purpose of this routine is to provide knowledge about what characters |
61 | /// may be escaped. Namely, most regex engines permit "superfluous" escapes |
62 | /// where characters without any special significance may be escaped even |
63 | /// though there is no actual _need_ to do so. |
64 | /// |
65 | /// This will return false for some characters. For example, `e` is not |
66 | /// escapeable. Therefore, `\e` will either result in a parse error (which is |
67 | /// true today), or it could backwards compatibly evolve into a new construct |
68 | /// with its own meaning. Indeed, that is the purpose of banning _some_ |
69 | /// superfluous escapes: it provides a way to evolve the syntax in a compatible |
70 | /// manner. |
71 | fn is_escapeable_character(c: char) -> bool { |
72 | // Certainly escapeable if it's a meta character. |
73 | if is_meta_character(c) { |
74 | return true; |
75 | } |
76 | // Any character that isn't ASCII is definitely not escapeable. There's |
77 | // no real need to allow things like \☃ right? |
78 | if !c.is_ascii() { |
79 | return false; |
80 | } |
81 | // Otherwise, we basically say that everything is escapeable unless it's a |
82 | // letter or digit. Things like \3 are either octal (when enabled) or an |
83 | // error, and we should keep it that way. Otherwise, letters are reserved |
84 | // for adding new syntax in a backwards compatible way. |
85 | match c { |
86 | '0' ..='9' | 'A' ..='Z' | 'a' ..='z' => false, |
87 | // While not currently supported, we keep these as not escapeable to |
88 | // give us some flexibility with respect to supporting the \< and |
89 | // \> word boundary assertions in the future. By rejecting them as |
90 | // escapeable, \< and \> will result in a parse error. Thus, we can |
91 | // turn them into something else in the future without it being a |
92 | // backwards incompatible change. |
93 | '<' | '>' => false, |
94 | _ => true, |
95 | } |
96 | } |
97 | |
98 | /// The configuration for a regex parser. |
99 | #[derive (Clone, Copy, Debug)] |
100 | pub(crate) struct Config { |
101 | /// The maximum number of times we're allowed to recurse. |
102 | /// |
103 | /// Note that unlike the regex-syntax parser, we actually use recursion in |
104 | /// this parser for simplicity. My hope is that by setting a conservative |
105 | /// default call limit and providing a way to configure it, that we can |
106 | /// keep this simplification. But if we must, we can re-work the parser to |
107 | /// put the call stack on the heap like regex-syntax does. |
108 | pub(crate) nest_limit: u32, |
109 | /// Various flags that control how a pattern is interpreted. |
110 | pub(crate) flags: Flags, |
111 | } |
112 | |
113 | impl Default for Config { |
114 | fn default() -> Config { |
115 | Config { nest_limit: 50, flags: Flags::default() } |
116 | } |
117 | } |
118 | |
119 | /// Various flags that control the interpretation of the pattern. |
120 | /// |
121 | /// These can be set via explicit configuration in code, or change dynamically |
122 | /// during parsing via inline flags. For example, `foo(?i:bar)baz` will match |
123 | /// `foo` and `baz` case sensitiviely and `bar` case insensitively (assuming a |
124 | /// default configuration). |
125 | #[derive (Clone, Copy, Debug, Default)] |
126 | pub(crate) struct Flags { |
127 | /// Whether to match case insensitively. |
128 | /// |
129 | /// This is the `i` flag. |
130 | pub(crate) case_insensitive: bool, |
131 | /// Whether `^` and `$` should be treated as line anchors or not. |
132 | /// |
133 | /// This is the `m` flag. |
134 | pub(crate) multi_line: bool, |
135 | /// Whether `.` should match line terminators or not. |
136 | /// |
137 | /// This is the `s` flag. |
138 | pub(crate) dot_matches_new_line: bool, |
139 | /// Whether to swap the meaning of greedy and non-greedy operators. |
140 | /// |
141 | /// This is the `U` flag. |
142 | pub(crate) swap_greed: bool, |
143 | /// Whether to enable CRLF mode. |
144 | /// |
145 | /// This is the `R` flag. |
146 | pub(crate) crlf: bool, |
147 | /// Whether to ignore whitespace. i.e., verbose mode. |
148 | /// |
149 | /// This is the `x` flag. |
150 | pub(crate) ignore_whitespace: bool, |
151 | } |
152 | |
153 | #[derive (Clone, Debug, Eq, PartialEq)] |
154 | pub(crate) struct Hir { |
155 | kind: HirKind, |
156 | is_start_anchored: bool, |
157 | is_match_empty: bool, |
158 | static_explicit_captures_len: Option<usize>, |
159 | } |
160 | |
161 | #[derive (Clone, Debug, Eq, PartialEq)] |
162 | pub(crate) enum HirKind { |
163 | Empty, |
164 | Char(char), |
165 | Class(Class), |
166 | Look(Look), |
167 | Repetition(Repetition), |
168 | Capture(Capture), |
169 | Concat(Vec<Hir>), |
170 | Alternation(Vec<Hir>), |
171 | } |
172 | |
173 | impl Hir { |
174 | /// Parses the given pattern string with the given configuration into a |
175 | /// structured representation. If the pattern is invalid, then an error |
176 | /// is returned. |
177 | pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> { |
178 | self::parse::Parser::new(config, pattern).parse() |
179 | } |
180 | |
181 | /// Returns the underlying kind of this high-level intermediate |
182 | /// representation. |
183 | /// |
184 | /// Note that there is explicitly no way to build an `Hir` directly from |
185 | /// an `HirKind`. If you need to do that, then you must do case analysis |
186 | /// on the `HirKind` and call the appropriate smart constructor on `Hir`. |
187 | pub(crate) fn kind(&self) -> &HirKind { |
188 | &self.kind |
189 | } |
190 | |
191 | /// Returns true if and only if this Hir expression can only match at the |
192 | /// beginning of a haystack. |
193 | pub(crate) fn is_start_anchored(&self) -> bool { |
194 | self.is_start_anchored |
195 | } |
196 | |
197 | /// Returns true if and only if this Hir expression can match the empty |
198 | /// string. |
199 | pub(crate) fn is_match_empty(&self) -> bool { |
200 | self.is_match_empty |
201 | } |
202 | |
203 | /// If the pattern always reports the same number of matching capture groups |
204 | /// for every match, then this returns the number of those groups. This |
205 | /// doesn't include the implicit group found in every pattern. |
206 | pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> { |
207 | self.static_explicit_captures_len |
208 | } |
209 | |
210 | fn fail() -> Hir { |
211 | let kind = HirKind::Class(Class { ranges: vec![] }); |
212 | Hir { |
213 | kind, |
214 | is_start_anchored: false, |
215 | is_match_empty: false, |
216 | static_explicit_captures_len: Some(0), |
217 | } |
218 | } |
219 | |
220 | fn empty() -> Hir { |
221 | let kind = HirKind::Empty; |
222 | Hir { |
223 | kind, |
224 | is_start_anchored: false, |
225 | is_match_empty: true, |
226 | static_explicit_captures_len: Some(0), |
227 | } |
228 | } |
229 | |
230 | fn char(ch: char) -> Hir { |
231 | let kind = HirKind::Char(ch); |
232 | Hir { |
233 | kind, |
234 | is_start_anchored: false, |
235 | is_match_empty: false, |
236 | static_explicit_captures_len: Some(0), |
237 | } |
238 | } |
239 | |
240 | fn class(class: Class) -> Hir { |
241 | let kind = HirKind::Class(class); |
242 | Hir { |
243 | kind, |
244 | is_start_anchored: false, |
245 | is_match_empty: false, |
246 | static_explicit_captures_len: Some(0), |
247 | } |
248 | } |
249 | |
250 | fn look(look: Look) -> Hir { |
251 | let kind = HirKind::Look(look); |
252 | Hir { |
253 | kind, |
254 | is_start_anchored: matches!(look, Look::Start), |
255 | is_match_empty: true, |
256 | static_explicit_captures_len: Some(0), |
257 | } |
258 | } |
259 | |
260 | fn repetition(rep: Repetition) -> Hir { |
261 | if rep.min == 0 && rep.max == Some(0) { |
262 | return Hir::empty(); |
263 | } else if rep.min == 1 && rep.max == Some(1) { |
264 | return *rep.sub; |
265 | } |
266 | let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored; |
267 | let is_match_empty = rep.min == 0 || rep.sub.is_match_empty; |
268 | let mut static_explicit_captures_len = |
269 | rep.sub.static_explicit_captures_len; |
270 | // If the static captures len of the sub-expression is not known or |
271 | // is greater than zero, then it automatically propagates to the |
272 | // repetition, regardless of the repetition. Otherwise, it might |
273 | // change, but only when the repetition can match 0 times. |
274 | if rep.min == 0 |
275 | && static_explicit_captures_len.map_or(false, |len| len > 0) |
276 | { |
277 | // If we require a match 0 times, then our captures len is |
278 | // guaranteed to be zero. Otherwise, if we *can* match the empty |
279 | // string, then it's impossible to know how many captures will be |
280 | // in the resulting match. |
281 | if rep.max == Some(0) { |
282 | static_explicit_captures_len = Some(0); |
283 | } else { |
284 | static_explicit_captures_len = None; |
285 | } |
286 | } |
287 | Hir { |
288 | kind: HirKind::Repetition(rep), |
289 | is_start_anchored, |
290 | is_match_empty, |
291 | static_explicit_captures_len, |
292 | } |
293 | } |
294 | |
295 | fn capture(cap: Capture) -> Hir { |
296 | let is_start_anchored = cap.sub.is_start_anchored; |
297 | let is_match_empty = cap.sub.is_match_empty; |
298 | let static_explicit_captures_len = cap |
299 | .sub |
300 | .static_explicit_captures_len |
301 | .map(|len| len.saturating_add(1)); |
302 | let kind = HirKind::Capture(cap); |
303 | Hir { |
304 | kind, |
305 | is_start_anchored, |
306 | is_match_empty, |
307 | static_explicit_captures_len, |
308 | } |
309 | } |
310 | |
311 | fn concat(mut subs: Vec<Hir>) -> Hir { |
312 | if subs.is_empty() { |
313 | Hir::empty() |
314 | } else if subs.len() == 1 { |
315 | subs.pop().unwrap() |
316 | } else { |
317 | let is_start_anchored = subs[0].is_start_anchored; |
318 | let mut is_match_empty = true; |
319 | let mut static_explicit_captures_len = Some(0usize); |
320 | for sub in subs.iter() { |
321 | is_match_empty = is_match_empty && sub.is_match_empty; |
322 | static_explicit_captures_len = static_explicit_captures_len |
323 | .and_then(|len1| { |
324 | Some((len1, sub.static_explicit_captures_len?)) |
325 | }) |
326 | .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); |
327 | } |
328 | Hir { |
329 | kind: HirKind::Concat(subs), |
330 | is_start_anchored, |
331 | is_match_empty, |
332 | static_explicit_captures_len, |
333 | } |
334 | } |
335 | } |
336 | |
337 | fn alternation(mut subs: Vec<Hir>) -> Hir { |
338 | if subs.is_empty() { |
339 | Hir::fail() |
340 | } else if subs.len() == 1 { |
341 | subs.pop().unwrap() |
342 | } else { |
343 | let mut it = subs.iter().peekable(); |
344 | let mut is_start_anchored = |
345 | it.peek().map_or(false, |sub| sub.is_start_anchored); |
346 | let mut is_match_empty = |
347 | it.peek().map_or(false, |sub| sub.is_match_empty); |
348 | let mut static_explicit_captures_len = |
349 | it.peek().and_then(|sub| sub.static_explicit_captures_len); |
350 | for sub in it { |
351 | is_start_anchored = is_start_anchored && sub.is_start_anchored; |
352 | is_match_empty = is_match_empty || sub.is_match_empty; |
353 | if static_explicit_captures_len |
354 | != sub.static_explicit_captures_len |
355 | { |
356 | static_explicit_captures_len = None; |
357 | } |
358 | } |
359 | Hir { |
360 | kind: HirKind::Alternation(subs), |
361 | is_start_anchored, |
362 | is_match_empty, |
363 | static_explicit_captures_len, |
364 | } |
365 | } |
366 | } |
367 | } |
368 | |
369 | impl HirKind { |
370 | /// Returns a slice of this kind's sub-expressions, if any. |
371 | fn subs(&self) -> &[Hir] { |
372 | use core::slice::from_ref; |
373 | |
374 | match *self { |
375 | HirKind::Empty |
376 | | HirKind::Char(_) |
377 | | HirKind::Class(_) |
378 | | HirKind::Look(_) => &[], |
379 | HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub), |
380 | HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub), |
381 | HirKind::Concat(ref subs: &Vec) => subs, |
382 | HirKind::Alternation(ref subs: &Vec) => subs, |
383 | } |
384 | } |
385 | } |
386 | |
387 | #[derive (Clone, Debug, Eq, PartialEq)] |
388 | pub(crate) struct Class { |
389 | pub(crate) ranges: Vec<ClassRange>, |
390 | } |
391 | |
392 | impl Class { |
393 | /// Create a new class from the given ranges. The ranges may be provided |
394 | /// in any order or may even overlap. They will be automatically |
395 | /// canonicalized. |
396 | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { |
397 | let mut class = Class { ranges: ranges.into_iter().collect() }; |
398 | class.canonicalize(); |
399 | class |
400 | } |
401 | |
402 | /// Expand this class such that it matches the ASCII codepoints in this set |
403 | /// case insensitively. |
404 | fn ascii_case_fold(&mut self) { |
405 | let len = self.ranges.len(); |
406 | for i in 0..len { |
407 | if let Some(folded) = self.ranges[i].ascii_case_fold() { |
408 | self.ranges.push(folded); |
409 | } |
410 | } |
411 | self.canonicalize(); |
412 | } |
413 | |
414 | /// Negate this set. |
415 | /// |
416 | /// For all `x` where `x` is any element, if `x` was in this set, then it |
417 | /// will not be in this set after negation. |
418 | fn negate(&mut self) { |
419 | const MIN: char = ' \x00' ; |
420 | const MAX: char = char::MAX; |
421 | |
422 | if self.ranges.is_empty() { |
423 | self.ranges.push(ClassRange { start: MIN, end: MAX }); |
424 | return; |
425 | } |
426 | |
427 | // There should be a way to do this in-place with constant memory, |
428 | // but I couldn't figure out a simple way to do it. So just append |
429 | // the negation to the end of this range, and then drain it before |
430 | // we're done. |
431 | let drain_end = self.ranges.len(); |
432 | |
433 | // If our class doesn't start the minimum possible char, then negation |
434 | // needs to include all codepoints up to the minimum in this set. |
435 | if self.ranges[0].start > MIN { |
436 | self.ranges.push(ClassRange { |
437 | start: MIN, |
438 | // OK because we know it's bigger than MIN. |
439 | end: prev_char(self.ranges[0].start).unwrap(), |
440 | }); |
441 | } |
442 | for i in 1..drain_end { |
443 | // let lower = self.ranges[i - 1].upper().increment(); |
444 | // let upper = self.ranges[i].lower().decrement(); |
445 | // self.ranges.push(I::create(lower, upper)); |
446 | self.ranges.push(ClassRange { |
447 | // OK because we know i-1 is never the last range and therefore |
448 | // there must be a range greater than it. It therefore follows |
449 | // that 'end' can never be char::MAX, and thus there must be |
450 | // a next char. |
451 | start: next_char(self.ranges[i - 1].end).unwrap(), |
452 | // Since 'i' is guaranteed to never be the first range, it |
453 | // follows that there is always a range before this and thus |
454 | // 'start' can never be '\x00'. Thus, there must be a previous |
455 | // char. |
456 | end: prev_char(self.ranges[i].start).unwrap(), |
457 | }); |
458 | } |
459 | if self.ranges[drain_end - 1].end < MAX { |
460 | // let lower = self.ranges[drain_end - 1].upper().increment(); |
461 | // self.ranges.push(I::create(lower, I::Bound::max_value())); |
462 | self.ranges.push(ClassRange { |
463 | // OK because we know 'end' is less than char::MAX, and thus |
464 | // there is a next char. |
465 | start: next_char(self.ranges[drain_end - 1].end).unwrap(), |
466 | end: MAX, |
467 | }); |
468 | } |
469 | self.ranges.drain(..drain_end); |
470 | // We don't need to canonicalize because we processed the ranges above |
471 | // in canonical order and the new ranges we added based on those are |
472 | // also necessarily in canonical order. |
473 | } |
474 | |
475 | /// Converts this set into a canonical ordering. |
476 | fn canonicalize(&mut self) { |
477 | if self.is_canonical() { |
478 | return; |
479 | } |
480 | self.ranges.sort(); |
481 | assert!(!self.ranges.is_empty()); |
482 | |
483 | // Is there a way to do this in-place with constant memory? I couldn't |
484 | // figure out a way to do it. So just append the canonicalization to |
485 | // the end of this range, and then drain it before we're done. |
486 | let drain_end = self.ranges.len(); |
487 | for oldi in 0..drain_end { |
488 | // If we've added at least one new range, then check if we can |
489 | // merge this range in the previously added range. |
490 | if self.ranges.len() > drain_end { |
491 | let (last, rest) = self.ranges.split_last_mut().unwrap(); |
492 | if let Some(union) = last.union(&rest[oldi]) { |
493 | *last = union; |
494 | continue; |
495 | } |
496 | } |
497 | self.ranges.push(self.ranges[oldi]); |
498 | } |
499 | self.ranges.drain(..drain_end); |
500 | } |
501 | |
502 | /// Returns true if and only if this class is in a canonical ordering. |
503 | fn is_canonical(&self) -> bool { |
504 | for pair in self.ranges.windows(2) { |
505 | if pair[0] >= pair[1] { |
506 | return false; |
507 | } |
508 | if pair[0].is_contiguous(&pair[1]) { |
509 | return false; |
510 | } |
511 | } |
512 | true |
513 | } |
514 | } |
515 | |
516 | #[derive (Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)] |
517 | pub(crate) struct ClassRange { |
518 | pub(crate) start: char, |
519 | pub(crate) end: char, |
520 | } |
521 | |
522 | impl ClassRange { |
523 | /// Apply simple case folding to this byte range. Only ASCII case mappings |
524 | /// (for A-Za-z) are applied. |
525 | /// |
526 | /// Additional ranges are appended to the given vector. Canonical ordering |
527 | /// is *not* maintained in the given vector. |
528 | fn ascii_case_fold(&self) -> Option<ClassRange> { |
529 | if !(ClassRange { start: 'a' , end: 'z' }).is_intersection_empty(self) { |
530 | let start = core::cmp::max(self.start, 'a' ); |
531 | let end = core::cmp::min(self.end, 'z' ); |
532 | return Some(ClassRange { |
533 | start: char::try_from(u32::from(start) - 32).unwrap(), |
534 | end: char::try_from(u32::from(end) - 32).unwrap(), |
535 | }); |
536 | } |
537 | if !(ClassRange { start: 'A' , end: 'Z' }).is_intersection_empty(self) { |
538 | let start = core::cmp::max(self.start, 'A' ); |
539 | let end = core::cmp::min(self.end, 'Z' ); |
540 | return Some(ClassRange { |
541 | start: char::try_from(u32::from(start) + 32).unwrap(), |
542 | end: char::try_from(u32::from(end) + 32).unwrap(), |
543 | }); |
544 | } |
545 | None |
546 | } |
547 | |
548 | /// Union the given overlapping range into this range. |
549 | /// |
550 | /// If the two ranges aren't contiguous, then this returns `None`. |
551 | fn union(&self, other: &ClassRange) -> Option<ClassRange> { |
552 | if !self.is_contiguous(other) { |
553 | return None; |
554 | } |
555 | let start = core::cmp::min(self.start, other.start); |
556 | let end = core::cmp::max(self.end, other.end); |
557 | Some(ClassRange { start, end }) |
558 | } |
559 | |
560 | /// Returns true if and only if the two ranges are contiguous. Two ranges |
561 | /// are contiguous if and only if the ranges are either overlapping or |
562 | /// adjacent. |
563 | fn is_contiguous(&self, other: &ClassRange) -> bool { |
564 | let (s1, e1) = (u32::from(self.start), u32::from(self.end)); |
565 | let (s2, e2) = (u32::from(other.start), u32::from(other.end)); |
566 | core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1) |
567 | } |
568 | |
569 | /// Returns true if and only if the intersection of this range and the |
570 | /// other range is empty. |
571 | fn is_intersection_empty(&self, other: &ClassRange) -> bool { |
572 | let (s1, e1) = (self.start, self.end); |
573 | let (s2, e2) = (other.start, other.end); |
574 | core::cmp::max(s1, s2) > core::cmp::min(e1, e2) |
575 | } |
576 | } |
577 | |
578 | /// The high-level intermediate representation for a look-around assertion. |
579 | /// |
580 | /// An assertion match is always zero-length. Also called an "empty match." |
581 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
582 | pub(crate) enum Look { |
583 | /// Match the beginning of text. Specifically, this matches at the starting |
584 | /// position of the input. |
585 | Start = 1 << 0, |
586 | /// Match the end of text. Specifically, this matches at the ending |
587 | /// position of the input. |
588 | End = 1 << 1, |
589 | /// Match the beginning of a line or the beginning of text. Specifically, |
590 | /// this matches at the starting position of the input, or at the position |
591 | /// immediately following a `\n` character. |
592 | StartLF = 1 << 2, |
593 | /// Match the end of a line or the end of text. Specifically, this matches |
594 | /// at the end position of the input, or at the position immediately |
595 | /// preceding a `\n` character. |
596 | EndLF = 1 << 3, |
597 | /// Match the beginning of a line or the beginning of text. Specifically, |
598 | /// this matches at the starting position of the input, or at the position |
599 | /// immediately following either a `\r` or `\n` character, but never after |
600 | /// a `\r` when a `\n` follows. |
601 | StartCRLF = 1 << 4, |
602 | /// Match the end of a line or the end of text. Specifically, this matches |
603 | /// at the end position of the input, or at the position immediately |
604 | /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` |
605 | /// precedes it. |
606 | EndCRLF = 1 << 5, |
607 | /// Match an ASCII-only word boundary. That is, this matches a position |
608 | /// where the left adjacent character and right adjacent character |
609 | /// correspond to a word and non-word or a non-word and word character. |
610 | Word = 1 << 6, |
611 | /// Match an ASCII-only negation of a word boundary. |
612 | WordNegate = 1 << 7, |
613 | /// Match the start of an ASCII-only word boundary. That is, this matches a |
614 | /// position at either the beginning of the haystack or where the previous |
615 | /// character is not a word character and the following character is a word |
616 | /// character. |
617 | WordStart = 1 << 8, |
618 | /// Match the end of an ASCII-only word boundary. That is, this matches |
619 | /// a position at either the end of the haystack or where the previous |
620 | /// character is a word character and the following character is not a word |
621 | /// character. |
622 | WordEnd = 1 << 9, |
623 | /// Match the start half of an ASCII-only word boundary. That is, this |
624 | /// matches a position at either the beginning of the haystack or where the |
625 | /// previous character is not a word character. |
626 | WordStartHalf = 1 << 10, |
627 | /// Match the end half of an ASCII-only word boundary. That is, this |
628 | /// matches a position at either the end of the haystack or where the |
629 | /// following character is not a word character. |
630 | WordEndHalf = 1 << 11, |
631 | } |
632 | |
633 | impl Look { |
634 | /// Returns true if the given position in the given haystack matches this |
635 | /// look-around assertion. |
636 | pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool { |
637 | use self::Look::*; |
638 | |
639 | match *self { |
640 | Start => at == 0, |
641 | End => at == haystack.len(), |
642 | StartLF => at == 0 || haystack[at - 1] == b' \n' , |
643 | EndLF => at == haystack.len() || haystack[at] == b' \n' , |
644 | StartCRLF => { |
645 | at == 0 |
646 | || haystack[at - 1] == b' \n' |
647 | || (haystack[at - 1] == b' \r' |
648 | && (at >= haystack.len() || haystack[at] != b' \n' )) |
649 | } |
650 | EndCRLF => { |
651 | at == haystack.len() |
652 | || haystack[at] == b' \r' |
653 | || (haystack[at] == b' \n' |
654 | && (at == 0 || haystack[at - 1] != b' \r' )) |
655 | } |
656 | Word => { |
657 | let word_before = |
658 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
659 | let word_after = |
660 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
661 | word_before != word_after |
662 | } |
663 | WordNegate => { |
664 | let word_before = |
665 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
666 | let word_after = |
667 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
668 | word_before == word_after |
669 | } |
670 | WordStart => { |
671 | let word_before = |
672 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
673 | let word_after = |
674 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
675 | !word_before && word_after |
676 | } |
677 | WordEnd => { |
678 | let word_before = |
679 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
680 | let word_after = |
681 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
682 | word_before && !word_after |
683 | } |
684 | WordStartHalf => { |
685 | let word_before = |
686 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
687 | !word_before |
688 | } |
689 | WordEndHalf => { |
690 | let word_after = |
691 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
692 | !word_after |
693 | } |
694 | } |
695 | } |
696 | } |
697 | |
698 | /// The high-level intermediate representation of a repetition operator. |
699 | /// |
700 | /// A repetition operator permits the repetition of an arbitrary |
701 | /// sub-expression. |
702 | #[derive (Clone, Debug, Eq, PartialEq)] |
703 | pub(crate) struct Repetition { |
704 | /// The minimum range of the repetition. |
705 | /// |
706 | /// Note that special cases like `?`, `+` and `*` all get translated into |
707 | /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. |
708 | /// |
709 | /// When `min` is zero, this expression can match the empty string |
710 | /// regardless of what its sub-expression is. |
711 | pub(crate) min: u32, |
712 | /// The maximum range of the repetition. |
713 | /// |
714 | /// Note that when `max` is `None`, `min` acts as a lower bound but where |
715 | /// there is no upper bound. For something like `x{5}` where the min and |
716 | /// max are equivalent, `min` will be set to `5` and `max` will be set to |
717 | /// `Some(5)`. |
718 | pub(crate) max: Option<u32>, |
719 | /// Whether this repetition operator is greedy or not. A greedy operator |
720 | /// will match as much as it can. A non-greedy operator will match as |
721 | /// little as it can. |
722 | /// |
723 | /// Typically, operators are greedy by default and are only non-greedy when |
724 | /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is |
725 | /// not. However, this can be inverted via the `U` "ungreedy" flag. |
726 | pub(crate) greedy: bool, |
727 | /// The expression being repeated. |
728 | pub(crate) sub: Box<Hir>, |
729 | } |
730 | |
731 | /// The high-level intermediate representation for a capturing group. |
732 | /// |
733 | /// A capturing group always has an index and a child expression. It may |
734 | /// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not |
735 | /// necessary. |
736 | /// |
737 | /// Note that there is no explicit representation of a non-capturing group |
738 | /// in a `Hir`. Instead, non-capturing grouping is handled automatically by |
739 | /// the recursive structure of the `Hir` itself. |
740 | #[derive (Clone, Debug, Eq, PartialEq)] |
741 | pub(crate) struct Capture { |
742 | /// The capture index of the capture. |
743 | pub(crate) index: u32, |
744 | /// The name of the capture, if it exists. |
745 | pub(crate) name: Option<Box<str>>, |
746 | /// The expression inside the capturing group, which may be empty. |
747 | pub(crate) sub: Box<Hir>, |
748 | } |
749 | |
750 | fn next_char(ch: char) -> Option<char> { |
751 | // Skip over the surrogate range. |
752 | if ch == ' \u{D7FF}' { |
753 | return Some(' \u{E000}' ); |
754 | } |
755 | // OK because char::MAX < u32::MAX and we handle U+D7FF above. |
756 | char::from_u32(u32::from(ch).checked_add(1).unwrap()) |
757 | } |
758 | |
759 | fn prev_char(ch: char) -> Option<char> { |
760 | // Skip over the surrogate range. |
761 | if ch == ' \u{E000}' { |
762 | return Some(' \u{D7FF}' ); |
763 | } |
764 | // OK because subtracting 1 from any valid scalar value other than 0 |
765 | // and U+E000 yields a valid scalar value. |
766 | Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap()) |
767 | } |
768 | |
769 | impl Drop for Hir { |
770 | fn drop(&mut self) { |
771 | use core::mem; |
772 | |
773 | match *self.kind() { |
774 | HirKind::Empty |
775 | | HirKind::Char(_) |
776 | | HirKind::Class(_) |
777 | | HirKind::Look(_) => return, |
778 | HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, |
779 | HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { |
780 | return |
781 | } |
782 | HirKind::Concat(ref x) if x.is_empty() => return, |
783 | HirKind::Alternation(ref x) if x.is_empty() => return, |
784 | _ => {} |
785 | } |
786 | |
787 | let mut stack = vec![mem::replace(self, Hir::empty())]; |
788 | while let Some(mut expr) = stack.pop() { |
789 | match expr.kind { |
790 | HirKind::Empty |
791 | | HirKind::Char(_) |
792 | | HirKind::Class(_) |
793 | | HirKind::Look(_) => {} |
794 | HirKind::Capture(ref mut x) => { |
795 | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
796 | } |
797 | HirKind::Repetition(ref mut x) => { |
798 | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
799 | } |
800 | HirKind::Concat(ref mut x) => { |
801 | stack.extend(x.drain(..)); |
802 | } |
803 | HirKind::Alternation(ref mut x) => { |
804 | stack.extend(x.drain(..)); |
805 | } |
806 | } |
807 | } |
808 | } |
809 | } |
810 | |