1 | use core::cell::{Cell, RefCell}; |
2 | |
3 | use alloc::{ |
4 | boxed::Box, |
5 | string::{String, ToString}, |
6 | vec, |
7 | vec::Vec, |
8 | }; |
9 | |
10 | use crate::{ |
11 | error::Error, |
12 | hir::{self, Config, Flags, Hir, HirKind}, |
13 | }; |
14 | |
15 | // These are all of the errors that can occur while parsing a regex. Unlike |
16 | // regex-syntax, our errors are not particularly great. They are just enough |
17 | // to get a general sense of what went wrong. But in exchange, the error |
18 | // reporting mechanism is *much* simpler than what's in regex-syntax. |
19 | // |
20 | // By convention, we use each of these messages in exactly one place. That |
21 | // way, every branch that leads to an error has a unique message. This in turn |
22 | // means that given a message, one can precisely identify which part of the |
23 | // parser reported it. |
24 | // |
25 | // Finally, we give names to each message so that we can reference them in |
26 | // tests. |
27 | const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting" ; |
28 | const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups" ; |
29 | const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name" ; |
30 | const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'" ; |
31 | const ERR_UNCLOSED_GROUP_QUESTION: &str = |
32 | "expected closing ')', but got end of pattern" ; |
33 | const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('" ; |
34 | const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported" ; |
35 | const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed" ; |
36 | const ERR_MISSING_GROUP_NAME: &str = |
37 | "expected capture group name, but got end of pattern" ; |
38 | const ERR_INVALID_GROUP_NAME: &str = "invalid group name" ; |
39 | const ERR_UNCLOSED_GROUP_NAME: &str = |
40 | "expected end of capture group name, but got end of pattern" ; |
41 | const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed" ; |
42 | const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag" ; |
43 | const ERR_FLAG_REPEATED_NEGATION: &str = |
44 | "inline flag negation cannot be repeated" ; |
45 | const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed" ; |
46 | const ERR_FLAG_UNEXPECTED_EOF: &str = |
47 | "expected ':' or ')' to end inline flags, but got end of pattern" ; |
48 | const ERR_FLAG_DANGLING_NEGATION: &str = |
49 | "inline flags cannot end with negation directive" ; |
50 | const ERR_DECIMAL_NO_DIGITS: &str = |
51 | "expected decimal number, but found no digits" ; |
52 | const ERR_DECIMAL_INVALID: &str = "got invalid decimal number" ; |
53 | const ERR_HEX_BRACE_INVALID_DIGIT: &str = |
54 | "expected hexadecimal number in braces, but got non-hex digit" ; |
55 | const ERR_HEX_BRACE_UNEXPECTED_EOF: &str = |
56 | "expected hexadecimal number, but saw end of pattern before closing brace" ; |
57 | const ERR_HEX_BRACE_EMPTY: &str = |
58 | "expected hexadecimal number in braces, but got no digits" ; |
59 | const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces" ; |
60 | const ERR_HEX_FIXED_UNEXPECTED_EOF: &str = |
61 | "expected fixed length hexadecimal number, but saw end of pattern first" ; |
62 | const ERR_HEX_FIXED_INVALID_DIGIT: &str = |
63 | "expected fixed length hexadecimal number, but got non-hex digit" ; |
64 | const ERR_HEX_FIXED_INVALID: &str = |
65 | "got invalid fixed length hexadecimal number" ; |
66 | const ERR_HEX_UNEXPECTED_EOF: &str = |
67 | "expected hexadecimal number, but saw end of pattern first" ; |
68 | const ERR_ESCAPE_UNEXPECTED_EOF: &str = |
69 | "saw start of escape sequence, but saw end of pattern before it finished" ; |
70 | const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported" ; |
71 | const ERR_UNICODE_CLASS_UNSUPPORTED: &str = |
72 | "Unicode character classes are not supported" ; |
73 | const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence" ; |
74 | const ERR_POSIX_CLASS_UNRECOGNIZED: &str = |
75 | "unrecognized POSIX character class" ; |
76 | const ERR_UNCOUNTED_REP_SUB_MISSING: &str = |
77 | "uncounted repetition operator must be applied to a sub-expression" ; |
78 | const ERR_COUNTED_REP_SUB_MISSING: &str = |
79 | "counted repetition operator must be applied to a sub-expression" ; |
80 | const ERR_COUNTED_REP_UNCLOSED: &str = |
81 | "found unclosed counted repetition operator" ; |
82 | const ERR_COUNTED_REP_MIN_UNCLOSED: &str = |
83 | "found incomplete and unclosed counted repetition operator" ; |
84 | const ERR_COUNTED_REP_COMMA_UNCLOSED: &str = |
85 | "found counted repetition operator with a comma that is unclosed" ; |
86 | const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str = |
87 | "found counted repetition with min and max that is unclosed" ; |
88 | const ERR_COUNTED_REP_INVALID: &str = |
89 | "expected closing brace for counted repetition, but got something else" ; |
90 | const ERR_COUNTED_REP_INVALID_RANGE: &str = |
91 | "found counted repetition with a min bigger than its max" ; |
92 | const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str = |
93 | "non-empty character class has no closing bracket" ; |
94 | const ERR_CLASS_INVALID_RANGE_ITEM: &str = |
95 | "character class ranges must start and end with a single character" ; |
96 | const ERR_CLASS_INVALID_ITEM: &str = |
97 | "invalid escape sequence in character class" ; |
98 | const ERR_CLASS_UNCLOSED_AFTER_DASH: &str = |
99 | "non-empty character class has no closing bracket after dash" ; |
100 | const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str = |
101 | "negated character class has no closing bracket" ; |
102 | const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str = |
103 | "character class begins with literal ']' but has no closing bracket" ; |
104 | const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class" ; |
105 | const ERR_CLASS_UNCLOSED: &str = "found unclosed character class" ; |
106 | const ERR_CLASS_NEST_UNSUPPORTED: &str = |
107 | "nested character classes are not supported" ; |
108 | const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str = |
109 | "character class intersection is not supported" ; |
110 | const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = |
111 | "character class difference is not supported" ; |
112 | const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = |
113 | "character class symmetric difference is not supported" ; |
114 | const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = |
115 | "special word boundary assertion is unclosed or has an invalid character" ; |
116 | const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = |
117 | "special word boundary assertion is unrecognized" ; |
118 | const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = |
119 | "found start of special word boundary or repetition without an end" ; |
120 | |
121 | /// A regular expression parser. |
122 | /// |
123 | /// This parses a string representation of a regular expression into an |
124 | /// abstract syntax tree. The size of the tree is proportional to the length |
125 | /// of the regular expression pattern. |
126 | /// |
127 | /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. |
128 | #[derive (Clone, Debug)] |
129 | pub(super) struct Parser<'a> { |
130 | /// The configuration of the parser as given by the caller. |
131 | config: Config, |
132 | /// The pattern we're parsing as given by the caller. |
133 | pattern: &'a str, |
134 | /// The call depth of the parser. This is incremented for each |
135 | /// sub-expression parsed. Its peak value is the maximum nesting of the |
136 | /// pattern. |
137 | depth: Cell<u32>, |
138 | /// The current position of the parser. |
139 | pos: Cell<usize>, |
140 | /// The current codepoint of the parser. The codepoint corresponds to the |
141 | /// codepoint encoded in `pattern` beginning at `pos`. |
142 | /// |
143 | /// This is `None` if and only if `pos == pattern.len()`. |
144 | char: Cell<Option<char>>, |
145 | /// The current capture index. |
146 | capture_index: Cell<u32>, |
147 | /// The flags that are currently set. |
148 | flags: RefCell<Flags>, |
149 | /// A sorted sequence of capture names. This is used to detect duplicate |
150 | /// capture names and report an error if one is detected. |
151 | capture_names: RefCell<Vec<String>>, |
152 | } |
153 | |
154 | /// The constructor and a variety of helper routines. |
155 | impl<'a> Parser<'a> { |
156 | /// Build a parser from this configuration with the given pattern. |
157 | pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> { |
158 | Parser { |
159 | config, |
160 | pattern, |
161 | depth: Cell::new(0), |
162 | pos: Cell::new(0), |
163 | char: Cell::new(pattern.chars().next()), |
164 | capture_index: Cell::new(0), |
165 | flags: RefCell::new(config.flags), |
166 | capture_names: RefCell::new(vec![]), |
167 | } |
168 | } |
169 | |
170 | /// Returns the full pattern string that we're parsing. |
171 | fn pattern(&self) -> &str { |
172 | self.pattern |
173 | } |
174 | |
175 | /// Return the current byte offset of the parser. |
176 | /// |
177 | /// The offset starts at `0` from the beginning of the regular expression |
178 | /// pattern string. |
179 | fn pos(&self) -> usize { |
180 | self.pos.get() |
181 | } |
182 | |
183 | /// Increments the call depth of the parser. |
184 | /// |
185 | /// If the call depth would exceed the configured nest limit, then this |
186 | /// returns an error. |
187 | /// |
188 | /// This returns the old depth. |
189 | fn increment_depth(&self) -> Result<u32, Error> { |
190 | let old = self.depth.get(); |
191 | if old > self.config.nest_limit { |
192 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
193 | } |
194 | // OK because our depth starts at 0, and we return an error if it |
195 | // ever reaches the limit. So the call depth can never exceed u32::MAX. |
196 | let new = old.checked_add(1).unwrap(); |
197 | self.depth.set(new); |
198 | Ok(old) |
199 | } |
200 | |
201 | /// Decrements the call depth of the parser. |
202 | /// |
203 | /// This panics if the current depth is 0. |
204 | fn decrement_depth(&self) { |
205 | let old = self.depth.get(); |
206 | // If this fails then the caller has a bug in how they're incrementing |
207 | // and decrementing the depth of the parser's call stack. |
208 | let new = old.checked_sub(1).unwrap(); |
209 | self.depth.set(new); |
210 | } |
211 | |
212 | /// Return the codepoint at the current position of the parser. |
213 | /// |
214 | /// This panics if the parser is positioned at the end of the pattern. |
215 | fn char(&self) -> char { |
216 | self.char.get().expect("codepoint, but parser is done" ) |
217 | } |
218 | |
219 | /// Returns true if the next call to `bump` would return false. |
220 | fn is_done(&self) -> bool { |
221 | self.pos() == self.pattern.len() |
222 | } |
223 | |
224 | /// Returns the flags that are current set for this regex. |
225 | fn flags(&self) -> Flags { |
226 | *self.flags.borrow() |
227 | } |
228 | |
229 | /// Bump the parser to the next Unicode scalar value. |
230 | /// |
231 | /// If the end of the input has been reached, then `false` is returned. |
232 | fn bump(&self) -> bool { |
233 | if self.is_done() { |
234 | return false; |
235 | } |
236 | self.pos.set(self.pos() + self.char().len_utf8()); |
237 | self.char.set(self.pattern()[self.pos()..].chars().next()); |
238 | self.char.get().is_some() |
239 | } |
240 | |
241 | /// If the substring starting at the current position of the parser has |
242 | /// the given prefix, then bump the parser to the character immediately |
243 | /// following the prefix and return true. Otherwise, don't bump the parser |
244 | /// and return false. |
245 | fn bump_if(&self, prefix: &str) -> bool { |
246 | if self.pattern()[self.pos()..].starts_with(prefix) { |
247 | for _ in 0..prefix.chars().count() { |
248 | self.bump(); |
249 | } |
250 | true |
251 | } else { |
252 | false |
253 | } |
254 | } |
255 | |
256 | /// Bump the parser, and if the `x` flag is enabled, bump through any |
257 | /// subsequent spaces. Return true if and only if the parser is not done. |
258 | fn bump_and_bump_space(&self) -> bool { |
259 | if !self.bump() { |
260 | return false; |
261 | } |
262 | self.bump_space(); |
263 | !self.is_done() |
264 | } |
265 | |
266 | /// If the `x` flag is enabled (i.e., whitespace insensitivity with |
267 | /// comments), then this will advance the parser through all whitespace |
268 | /// and comments to the next non-whitespace non-comment byte. |
269 | /// |
270 | /// If the `x` flag is disabled, then this is a no-op. |
271 | /// |
272 | /// This should be used selectively throughout the parser where |
273 | /// arbitrary whitespace is permitted when the `x` flag is enabled. For |
274 | /// example, `{ 5 , 6}` is equivalent to `{5,6}`. |
275 | fn bump_space(&self) { |
276 | if !self.flags().ignore_whitespace { |
277 | return; |
278 | } |
279 | while !self.is_done() { |
280 | if self.char().is_whitespace() { |
281 | self.bump(); |
282 | } else if self.char() == '#' { |
283 | self.bump(); |
284 | while !self.is_done() { |
285 | let c = self.char(); |
286 | self.bump(); |
287 | if c == ' \n' { |
288 | break; |
289 | } |
290 | } |
291 | } else { |
292 | break; |
293 | } |
294 | } |
295 | } |
296 | |
297 | /// Peek at the next character in the input without advancing the parser. |
298 | /// |
299 | /// If the input has been exhausted, then this returns `None`. |
300 | fn peek(&self) -> Option<char> { |
301 | if self.is_done() { |
302 | return None; |
303 | } |
304 | self.pattern()[self.pos() + self.char().len_utf8()..].chars().next() |
305 | } |
306 | |
307 | /// Peeks at the next character in the pattern from the current offset, and |
308 | /// will ignore spaces when the parser is in whitespace insensitive mode. |
309 | fn peek_space(&self) -> Option<char> { |
310 | if !self.flags().ignore_whitespace { |
311 | return self.peek(); |
312 | } |
313 | if self.is_done() { |
314 | return None; |
315 | } |
316 | let mut start = self.pos() + self.char().len_utf8(); |
317 | let mut in_comment = false; |
318 | for (i, ch) in self.pattern()[start..].char_indices() { |
319 | if ch.is_whitespace() { |
320 | continue; |
321 | } else if !in_comment && ch == '#' { |
322 | in_comment = true; |
323 | } else if in_comment && ch == ' \n' { |
324 | in_comment = false; |
325 | } else { |
326 | start += i; |
327 | break; |
328 | } |
329 | } |
330 | self.pattern()[start..].chars().next() |
331 | } |
332 | |
333 | /// Return the next capturing index. Each subsequent call increments the |
334 | /// internal index. Since the way capture indices are computed is a public |
335 | /// API guarantee, use of this routine depends on the parser being depth |
336 | /// first and left-to-right. |
337 | /// |
338 | /// If the capture limit is exceeded, then an error is returned. |
339 | fn next_capture_index(&self) -> Result<u32, Error> { |
340 | let current = self.capture_index.get(); |
341 | let next = current |
342 | .checked_add(1) |
343 | .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?; |
344 | self.capture_index.set(next); |
345 | Ok(next) |
346 | } |
347 | |
348 | /// Adds the given capture name to this parser. If this capture name has |
349 | /// already been used, then an error is returned. |
350 | fn add_capture_name(&self, name: &str) -> Result<(), Error> { |
351 | let mut names = self.capture_names.borrow_mut(); |
352 | match names.binary_search_by(|n| name.cmp(n)) { |
353 | Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)), |
354 | Err(i) => { |
355 | names.insert(i, name.to_string()); |
356 | Ok(()) |
357 | } |
358 | } |
359 | } |
360 | |
361 | /// Returns true if and only if the parser is positioned at a look-around |
362 | /// prefix. The conditions under which this returns true must always |
363 | /// correspond to a regular expression that would otherwise be consider |
364 | /// invalid. |
365 | /// |
366 | /// This should only be called immediately after parsing the opening of |
367 | /// a group or a set of flags. |
368 | fn is_lookaround_prefix(&self) -> bool { |
369 | self.bump_if("?=" ) |
370 | || self.bump_if("?!" ) |
371 | || self.bump_if("?<=" ) |
372 | || self.bump_if("?<!" ) |
373 | } |
374 | } |
375 | |
376 | /// The actual parser. We try to break out each kind of regex syntax into its |
377 | /// own routine. |
378 | impl<'a> Parser<'a> { |
379 | pub(super) fn parse(&self) -> Result<Hir, Error> { |
380 | let hir = self.parse_inner()?; |
381 | // While we also check nesting during parsing, that only checks the |
382 | // number of recursive parse calls. It does not necessarily cover |
383 | // all possible recursive nestings of the Hir itself. For example, |
384 | // repetition operators don't require recursive parse calls. So one |
385 | // can stack them arbitrarily without overflowing the stack in the |
386 | // *parser*. But then if one recurses over the resulting Hir, a stack |
387 | // overflow is possible. So here we check the Hir nesting level |
388 | // thoroughly to ensure it isn't nested too deeply. |
389 | // |
390 | // Note that we do still need the nesting limit check in the parser as |
391 | // well, since that will avoid overflowing the stack during parse time |
392 | // before the complete Hir value is constructed. |
393 | check_hir_nesting(&hir, self.config.nest_limit)?; |
394 | Ok(hir) |
395 | } |
396 | |
397 | fn parse_inner(&self) -> Result<Hir, Error> { |
398 | let depth = self.increment_depth()?; |
399 | let mut alternates = vec![]; |
400 | let mut concat = vec![]; |
401 | loop { |
402 | self.bump_space(); |
403 | if self.is_done() { |
404 | break; |
405 | } |
406 | match self.char() { |
407 | '(' => { |
408 | // Save the old flags and reset them only when we close |
409 | // the group. |
410 | let oldflags = *self.flags.borrow(); |
411 | if let Some(sub) = self.parse_group()? { |
412 | concat.push(sub); |
413 | // We only reset them here because if 'parse_group' |
414 | // returns None, then that means it handled a flag |
415 | // directive, e.g., '(?ism)'. And the whole point is |
416 | // that those flags remain active until either disabled |
417 | // or the end of the pattern or current group. |
418 | *self.flags.borrow_mut() = oldflags; |
419 | } |
420 | if self.char.get() != Some(')' ) { |
421 | return Err(Error::new(ERR_UNCLOSED_GROUP)); |
422 | } |
423 | self.bump(); |
424 | } |
425 | ')' => { |
426 | if depth == 0 { |
427 | return Err(Error::new(ERR_UNOPENED_GROUP)); |
428 | } |
429 | break; |
430 | } |
431 | '|' => { |
432 | alternates.push(Hir::concat(core::mem::take(&mut concat))); |
433 | self.bump(); |
434 | } |
435 | '[' => concat.push(self.parse_class()?), |
436 | '?' | '*' | '+' => { |
437 | concat = self.parse_uncounted_repetition(concat)?; |
438 | } |
439 | '{' => { |
440 | concat = self.parse_counted_repetition(concat)?; |
441 | } |
442 | _ => concat.push(self.parse_primitive()?), |
443 | } |
444 | } |
445 | self.decrement_depth(); |
446 | alternates.push(Hir::concat(concat)); |
447 | // N.B. This strips off the "alternation" if there's only one branch. |
448 | Ok(Hir::alternation(alternates)) |
449 | } |
450 | |
451 | /// Parses a "primitive" pattern. A primitive is any expression that does |
452 | /// not contain any sub-expressions. |
453 | /// |
454 | /// This assumes the parser is pointing at the beginning of the primitive. |
455 | fn parse_primitive(&self) -> Result<Hir, Error> { |
456 | let ch = self.char(); |
457 | self.bump(); |
458 | match ch { |
459 | ' \\' => self.parse_escape(), |
460 | '.' => Ok(self.hir_dot()), |
461 | '^' => Ok(self.hir_anchor_start()), |
462 | '$' => Ok(self.hir_anchor_end()), |
463 | ch => Ok(self.hir_char(ch)), |
464 | } |
465 | } |
466 | |
467 | /// Parse an escape sequence. This always results in a "primitive" HIR, |
468 | /// that is, an HIR with no sub-expressions. |
469 | /// |
470 | /// This assumes the parser is positioned at the start of the sequence, |
471 | /// immediately *after* the `\`. It advances the parser to the first |
472 | /// position immediately following the escape sequence. |
473 | fn parse_escape(&self) -> Result<Hir, Error> { |
474 | if self.is_done() { |
475 | return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF)); |
476 | } |
477 | let ch = self.char(); |
478 | // Put some of the more complicated routines into helpers. |
479 | match ch { |
480 | '0' ..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)), |
481 | 'p' | 'P' => { |
482 | return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED)) |
483 | } |
484 | 'x' | 'u' | 'U' => return self.parse_hex(), |
485 | 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { |
486 | return Ok(self.parse_perl_class()); |
487 | } |
488 | _ => {} |
489 | } |
490 | |
491 | // Handle all of the one letter sequences inline. |
492 | self.bump(); |
493 | if hir::is_meta_character(ch) || hir::is_escapeable_character(ch) { |
494 | return Ok(self.hir_char(ch)); |
495 | } |
496 | let special = |ch| Ok(self.hir_char(ch)); |
497 | match ch { |
498 | 'a' => special(' \x07' ), |
499 | 'f' => special(' \x0C' ), |
500 | 't' => special(' \t' ), |
501 | 'n' => special(' \n' ), |
502 | 'r' => special(' \r' ), |
503 | 'v' => special(' \x0B' ), |
504 | 'A' => Ok(Hir::look(hir::Look::Start)), |
505 | 'z' => Ok(Hir::look(hir::Look::End)), |
506 | 'b' => { |
507 | let mut hir = Hir::look(hir::Look::Word); |
508 | if !self.is_done() && self.char() == '{' { |
509 | if let Some(special) = |
510 | self.maybe_parse_special_word_boundary()? |
511 | { |
512 | hir = special; |
513 | } |
514 | } |
515 | Ok(hir) |
516 | } |
517 | 'B' => Ok(Hir::look(hir::Look::WordNegate)), |
518 | '<' => Ok(Hir::look(hir::Look::WordStart)), |
519 | '>' => Ok(Hir::look(hir::Look::WordEnd)), |
520 | _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), |
521 | } |
522 | } |
523 | |
524 | /// Attempt to parse a specialty word boundary. That is, `\b{start}`, |
525 | /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. |
526 | /// |
527 | /// This is similar to `maybe_parse_ascii_class` in that, in most cases, |
528 | /// if it fails it will just return `None` with no error. This is done |
529 | /// because `\b{5}` is a valid expression and we want to let that be parsed |
530 | /// by the existing counted repetition parsing code. (I thought about just |
531 | /// invoking the counted repetition code from here, but it seemed a little |
532 | /// ham-fisted.) |
533 | /// |
534 | /// Unlike `maybe_parse_ascii_class` though, this can return an error. |
535 | /// Namely, if we definitely know it isn't a counted repetition, then we |
536 | /// return an error specific to the specialty word boundaries. |
537 | /// |
538 | /// This assumes the parser is positioned at a `{` immediately following |
539 | /// a `\b`. When `None` is returned, the parser is returned to the position |
540 | /// at which it started: pointing at a `{`. |
541 | /// |
542 | /// The position given should correspond to the start of the `\b`. |
543 | fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> { |
544 | assert_eq!(self.char(), '{' ); |
545 | |
546 | let is_valid_char = |c| match c { |
547 | 'A' ..='Z' | 'a' ..='z' | '-' => true, |
548 | _ => false, |
549 | }; |
550 | let start = self.pos(); |
551 | if !self.bump_and_bump_space() { |
552 | return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF)); |
553 | } |
554 | // This is one of the critical bits: if the first non-whitespace |
555 | // character isn't in [-A-Za-z] (i.e., this can't be a special word |
556 | // boundary), then we bail and let the counted repetition parser deal |
557 | // with this. |
558 | if !is_valid_char(self.char()) { |
559 | self.pos.set(start); |
560 | self.char.set(Some('{' )); |
561 | return Ok(None); |
562 | } |
563 | |
564 | // Now collect up our chars until we see a '}'. |
565 | let mut scratch = String::new(); |
566 | while !self.is_done() && is_valid_char(self.char()) { |
567 | scratch.push(self.char()); |
568 | self.bump_and_bump_space(); |
569 | } |
570 | if self.is_done() || self.char() != '}' { |
571 | return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED)); |
572 | } |
573 | self.bump(); |
574 | let kind = match scratch.as_str() { |
575 | "start" => hir::Look::WordStart, |
576 | "end" => hir::Look::WordEnd, |
577 | "start-half" => hir::Look::WordStartHalf, |
578 | "end-half" => hir::Look::WordEndHalf, |
579 | _ => { |
580 | return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED)) |
581 | } |
582 | }; |
583 | Ok(Some(Hir::look(kind))) |
584 | } |
585 | |
586 | /// Parse a hex representation of a Unicode codepoint. This handles both |
587 | /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to |
588 | /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to |
589 | /// the first character immediately following the hexadecimal literal. |
590 | fn parse_hex(&self) -> Result<Hir, Error> { |
591 | let digit_len = match self.char() { |
592 | 'x' => 2, |
593 | 'u' => 4, |
594 | 'U' => 8, |
595 | unk => unreachable!( |
596 | "invalid start of fixed length hexadecimal number {}" , |
597 | unk |
598 | ), |
599 | }; |
600 | if !self.bump_and_bump_space() { |
601 | return Err(Error::new(ERR_HEX_UNEXPECTED_EOF)); |
602 | } |
603 | if self.char() == '{' { |
604 | self.parse_hex_brace() |
605 | } else { |
606 | self.parse_hex_digits(digit_len) |
607 | } |
608 | } |
609 | |
610 | /// Parse an N-digit hex representation of a Unicode codepoint. This |
611 | /// expects the parser to be positioned at the first digit and will advance |
612 | /// the parser to the first character immediately following the escape |
613 | /// sequence. |
614 | /// |
615 | /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) |
616 | /// or 8 (for `\UNNNNNNNN`). |
617 | fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> { |
618 | let mut scratch = String::new(); |
619 | for i in 0..digit_len { |
620 | if i > 0 && !self.bump_and_bump_space() { |
621 | return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF)); |
622 | } |
623 | if !is_hex(self.char()) { |
624 | return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT)); |
625 | } |
626 | scratch.push(self.char()); |
627 | } |
628 | // The final bump just moves the parser past the literal, which may |
629 | // be EOF. |
630 | self.bump_and_bump_space(); |
631 | match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
632 | None => Err(Error::new(ERR_HEX_FIXED_INVALID)), |
633 | Some(ch) => Ok(self.hir_char(ch)), |
634 | } |
635 | } |
636 | |
637 | /// Parse a hex representation of any Unicode scalar value. This expects |
638 | /// the parser to be positioned at the opening brace `{` and will advance |
639 | /// the parser to the first character following the closing brace `}`. |
640 | fn parse_hex_brace(&self) -> Result<Hir, Error> { |
641 | let mut scratch = String::new(); |
642 | while self.bump_and_bump_space() && self.char() != '}' { |
643 | if !is_hex(self.char()) { |
644 | return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT)); |
645 | } |
646 | scratch.push(self.char()); |
647 | } |
648 | if self.is_done() { |
649 | return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF)); |
650 | } |
651 | assert_eq!(self.char(), '}' ); |
652 | self.bump_and_bump_space(); |
653 | |
654 | if scratch.is_empty() { |
655 | return Err(Error::new(ERR_HEX_BRACE_EMPTY)); |
656 | } |
657 | match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
658 | None => Err(Error::new(ERR_HEX_BRACE_INVALID)), |
659 | Some(ch) => Ok(self.hir_char(ch)), |
660 | } |
661 | } |
662 | |
663 | /// Parse a decimal number into a u32 while trimming leading and trailing |
664 | /// whitespace. |
665 | /// |
666 | /// This expects the parser to be positioned at the first position where |
667 | /// a decimal digit could occur. This will advance the parser to the byte |
668 | /// immediately following the last contiguous decimal digit. |
669 | /// |
670 | /// If no decimal digit could be found or if there was a problem parsing |
671 | /// the complete set of digits into a u32, then an error is returned. |
672 | fn parse_decimal(&self) -> Result<u32, Error> { |
673 | let mut scratch = String::new(); |
674 | while !self.is_done() && self.char().is_whitespace() { |
675 | self.bump(); |
676 | } |
677 | while !self.is_done() && '0' <= self.char() && self.char() <= '9' { |
678 | scratch.push(self.char()); |
679 | self.bump_and_bump_space(); |
680 | } |
681 | while !self.is_done() && self.char().is_whitespace() { |
682 | self.bump_and_bump_space(); |
683 | } |
684 | let digits = scratch.as_str(); |
685 | if digits.is_empty() { |
686 | return Err(Error::new(ERR_DECIMAL_NO_DIGITS)); |
687 | } |
688 | match u32::from_str_radix(digits, 10).ok() { |
689 | Some(n) => Ok(n), |
690 | None => Err(Error::new(ERR_DECIMAL_INVALID)), |
691 | } |
692 | } |
693 | |
694 | /// Parses an uncounted repetition operator. An uncounted repetition |
695 | /// operator includes `?`, `*` and `+`, but does not include the `{m,n}` |
696 | /// syntax. The current character should be one of `?`, `*` or `+`. Any |
697 | /// other character will result in a panic. |
698 | /// |
699 | /// This assumes that the parser is currently positioned at the repetition |
700 | /// operator and advances the parser to the first character after the |
701 | /// operator. (Note that the operator may include a single additional `?`, |
702 | /// which makes the operator ungreedy.) |
703 | /// |
704 | /// The caller should include the concatenation that is being built. The |
705 | /// concatenation returned includes the repetition operator applied to the |
706 | /// last expression in the given concatenation. |
707 | /// |
708 | /// If the concatenation is empty, then this returns an error. |
709 | fn parse_uncounted_repetition( |
710 | &self, |
711 | mut concat: Vec<Hir>, |
712 | ) -> Result<Vec<Hir>, Error> { |
713 | let sub = match concat.pop() { |
714 | Some(hir) => Box::new(hir), |
715 | None => { |
716 | return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING)); |
717 | } |
718 | }; |
719 | let (min, max) = match self.char() { |
720 | '?' => (0, Some(1)), |
721 | '*' => (0, None), |
722 | '+' => (1, None), |
723 | unk => unreachable!("unrecognized repetition operator ' {}'" , unk), |
724 | }; |
725 | let mut greedy = true; |
726 | if self.bump() && self.char() == '?' { |
727 | greedy = false; |
728 | self.bump(); |
729 | } |
730 | if self.flags().swap_greed { |
731 | greedy = !greedy; |
732 | } |
733 | concat.push(Hir::repetition(hir::Repetition { |
734 | min, |
735 | max, |
736 | greedy, |
737 | sub, |
738 | })); |
739 | Ok(concat) |
740 | } |
741 | |
742 | /// Parses a counted repetition operation. A counted repetition operator |
743 | /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or |
744 | /// `+` operators. |
745 | /// |
746 | /// This assumes that the parser is currently at the opening `{` and |
747 | /// advances the parser to the first character after the operator. (Note |
748 | /// that the operator may include a single additional `?`, which makes the |
749 | /// operator ungreedy.) |
750 | /// |
751 | /// The caller should include the concatenation that is being built. The |
752 | /// concatenation returned includes the repetition operator applied to the |
753 | /// last expression in the given concatenation. |
754 | /// |
755 | /// If the concatenation is empty, then this returns an error. |
756 | fn parse_counted_repetition( |
757 | &self, |
758 | mut concat: Vec<Hir>, |
759 | ) -> Result<Vec<Hir>, Error> { |
760 | assert_eq!(self.char(), '{' , "expected opening brace" ); |
761 | let sub = match concat.pop() { |
762 | Some(hir) => Box::new(hir), |
763 | None => { |
764 | return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING)); |
765 | } |
766 | }; |
767 | if !self.bump_and_bump_space() { |
768 | return Err(Error::new(ERR_COUNTED_REP_UNCLOSED)); |
769 | } |
770 | let min = self.parse_decimal()?; |
771 | let mut max = Some(min); |
772 | if self.is_done() { |
773 | return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED)); |
774 | } |
775 | if self.char() == ',' { |
776 | if !self.bump_and_bump_space() { |
777 | return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED)); |
778 | } |
779 | if self.char() != '}' { |
780 | max = Some(self.parse_decimal()?); |
781 | } else { |
782 | max = None; |
783 | } |
784 | if self.is_done() { |
785 | return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED)); |
786 | } |
787 | } |
788 | if self.char() != '}' { |
789 | return Err(Error::new(ERR_COUNTED_REP_INVALID)); |
790 | } |
791 | |
792 | let mut greedy = true; |
793 | if self.bump_and_bump_space() && self.char() == '?' { |
794 | greedy = false; |
795 | self.bump(); |
796 | } |
797 | if self.flags().swap_greed { |
798 | greedy = !greedy; |
799 | } |
800 | |
801 | if max.map_or(false, |max| min > max) { |
802 | return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE)); |
803 | } |
804 | concat.push(Hir::repetition(hir::Repetition { |
805 | min, |
806 | max, |
807 | greedy, |
808 | sub, |
809 | })); |
810 | Ok(concat) |
811 | } |
812 | |
813 | /// Parses the part of a pattern that starts with a `(`. This is usually |
814 | /// a group sub-expression, but might just be a directive that enables |
815 | /// (or disables) certain flags. |
816 | /// |
817 | /// This assumes the parser is pointing at the opening `(`. |
818 | fn parse_group(&self) -> Result<Option<Hir>, Error> { |
819 | assert_eq!(self.char(), '(' ); |
820 | self.bump_and_bump_space(); |
821 | if self.is_lookaround_prefix() { |
822 | return Err(Error::new(ERR_LOOK_UNSUPPORTED)); |
823 | } |
824 | if self.bump_if("?P<" ) || self.bump_if("?<" ) { |
825 | let index = self.next_capture_index()?; |
826 | let name = Some(Box::from(self.parse_capture_name()?)); |
827 | let sub = Box::new(self.parse_inner()?); |
828 | let cap = hir::Capture { index, name, sub }; |
829 | Ok(Some(Hir::capture(cap))) |
830 | } else if self.bump_if("?" ) { |
831 | if self.is_done() { |
832 | return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION)); |
833 | } |
834 | let start = self.pos(); |
835 | // The flags get reset in the top-level 'parse' routine. |
836 | *self.flags.borrow_mut() = self.parse_flags()?; |
837 | let consumed = self.pos() - start; |
838 | if self.char() == ')' { |
839 | // We don't allow empty flags, e.g., `(?)`. |
840 | if consumed == 0 { |
841 | return Err(Error::new(ERR_EMPTY_FLAGS)); |
842 | } |
843 | Ok(None) |
844 | } else { |
845 | assert_eq!(':' , self.char()); |
846 | self.bump(); |
847 | self.parse_inner().map(Some) |
848 | } |
849 | } else { |
850 | let index = self.next_capture_index()?; |
851 | let sub = Box::new(self.parse_inner()?); |
852 | let cap = hir::Capture { index, name: None, sub }; |
853 | Ok(Some(Hir::capture(cap))) |
854 | } |
855 | } |
856 | |
857 | /// Parses a capture group name. Assumes that the parser is positioned at |
858 | /// the first character in the name following the opening `<` (and may |
859 | /// possibly be EOF). This advances the parser to the first character |
860 | /// following the closing `>`. |
861 | fn parse_capture_name(&self) -> Result<&str, Error> { |
862 | if self.is_done() { |
863 | return Err(Error::new(ERR_MISSING_GROUP_NAME)); |
864 | } |
865 | let start = self.pos(); |
866 | loop { |
867 | if self.char() == '>' { |
868 | break; |
869 | } |
870 | if !is_capture_char(self.char(), self.pos() == start) { |
871 | return Err(Error::new(ERR_INVALID_GROUP_NAME)); |
872 | } |
873 | if !self.bump() { |
874 | break; |
875 | } |
876 | } |
877 | let end = self.pos(); |
878 | if self.is_done() { |
879 | return Err(Error::new(ERR_UNCLOSED_GROUP_NAME)); |
880 | } |
881 | assert_eq!(self.char(), '>' ); |
882 | self.bump(); |
883 | let name = &self.pattern()[start..end]; |
884 | if name.is_empty() { |
885 | return Err(Error::new(ERR_EMPTY_GROUP_NAME)); |
886 | } |
887 | self.add_capture_name(name)?; |
888 | Ok(name) |
889 | } |
890 | |
891 | /// Parse a sequence of flags starting at the current character. |
892 | /// |
893 | /// This advances the parser to the character immediately following the |
894 | /// flags, which is guaranteed to be either `:` or `)`. |
895 | /// |
896 | /// # Errors |
897 | /// |
898 | /// If any flags are duplicated, then an error is returned. |
899 | /// |
900 | /// If the negation operator is used more than once, then an error is |
901 | /// returned. |
902 | /// |
903 | /// If no flags could be found or if the negation operation is not followed |
904 | /// by any flags, then an error is returned. |
905 | fn parse_flags(&self) -> Result<Flags, Error> { |
906 | let mut flags = *self.flags.borrow(); |
907 | let mut negate = false; |
908 | // Keeps track of whether the previous flag item was a '-'. We use this |
909 | // to detect whether there is a dangling '-', which is invalid. |
910 | let mut last_was_negation = false; |
911 | // A set to keep track of the flags we've seen. Since all flags are |
912 | // ASCII, we only need 128 bytes. |
913 | let mut seen = [false; 128]; |
914 | while self.char() != ':' && self.char() != ')' { |
915 | if self.char() == '-' { |
916 | last_was_negation = true; |
917 | if negate { |
918 | return Err(Error::new(ERR_FLAG_REPEATED_NEGATION)); |
919 | } |
920 | negate = true; |
921 | } else { |
922 | last_was_negation = false; |
923 | self.parse_flag(&mut flags, negate)?; |
924 | // OK because every valid flag is ASCII, and we're only here if |
925 | // the flag is valid. |
926 | let flag_byte = u8::try_from(self.char()).unwrap(); |
927 | if seen[usize::from(flag_byte)] { |
928 | return Err(Error::new(ERR_FLAG_DUPLICATE)); |
929 | } |
930 | seen[usize::from(flag_byte)] = true; |
931 | } |
932 | if !self.bump() { |
933 | return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF)); |
934 | } |
935 | } |
936 | if last_was_negation { |
937 | return Err(Error::new(ERR_FLAG_DANGLING_NEGATION)); |
938 | } |
939 | Ok(flags) |
940 | } |
941 | |
942 | /// Parse the current character as a flag. Do not advance the parser. |
943 | /// |
944 | /// This sets the appropriate boolean value in place on the set of flags |
945 | /// given. The boolean is inverted when `negate` is true. |
946 | /// |
947 | /// # Errors |
948 | /// |
949 | /// If the flag is not recognized, then an error is returned. |
950 | fn parse_flag( |
951 | &self, |
952 | flags: &mut Flags, |
953 | negate: bool, |
954 | ) -> Result<(), Error> { |
955 | let enabled = !negate; |
956 | match self.char() { |
957 | 'i' => flags.case_insensitive = enabled, |
958 | 'm' => flags.multi_line = enabled, |
959 | 's' => flags.dot_matches_new_line = enabled, |
960 | 'U' => flags.swap_greed = enabled, |
961 | 'R' => flags.crlf = enabled, |
962 | 'x' => flags.ignore_whitespace = enabled, |
963 | // We make a special exception for this flag where we let it |
964 | // through as a recognized flag, but treat it as a no-op. This in |
965 | // practice retains some compatibility with the regex crate. It is |
966 | // a little suspect to do this, but for example, '(?-u:\b).+' in |
967 | // the regex crate is equivalent to '\b.+' in regex-lite. |
968 | 'u' => {} |
969 | _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)), |
970 | } |
971 | Ok(()) |
972 | } |
973 | |
974 | /// Parse a standard character class consisting primarily of characters or |
975 | /// character ranges. |
976 | /// |
977 | /// This assumes the parser is positioned at the opening `[`. If parsing |
978 | /// is successful, then the parser is advanced to the position immediately |
979 | /// following the closing `]`. |
980 | fn parse_class(&self) -> Result<Hir, Error> { |
981 | assert_eq!(self.char(), '[' ); |
982 | |
983 | let mut union = vec![]; |
984 | if !self.bump_and_bump_space() { |
985 | return Err(Error::new(ERR_CLASS_UNCLOSED)); |
986 | } |
987 | // Determine whether the class is negated or not. |
988 | let negate = if self.char() != '^' { |
989 | false |
990 | } else { |
991 | if !self.bump_and_bump_space() { |
992 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION)); |
993 | } |
994 | true |
995 | }; |
996 | // Accept any number of `-` as literal `-`. |
997 | while self.char() == '-' { |
998 | union.push(hir::ClassRange { start: '-' , end: '-' }); |
999 | if !self.bump_and_bump_space() { |
1000 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
1001 | } |
1002 | } |
1003 | // If `]` is the *first* char in a set, then interpret it as a literal |
1004 | // `]`. That is, an empty class is impossible to write. |
1005 | if union.is_empty() && self.char() == ']' { |
1006 | union.push(hir::ClassRange { start: ']' , end: ']' }); |
1007 | if !self.bump_and_bump_space() { |
1008 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING)); |
1009 | } |
1010 | } |
1011 | loop { |
1012 | self.bump_space(); |
1013 | if self.is_done() { |
1014 | return Err(Error::new(ERR_CLASS_UNCLOSED)); |
1015 | } |
1016 | match self.char() { |
1017 | '[' => { |
1018 | // Attempt to treat this as the beginning of a POSIX class. |
1019 | // If POSIX class parsing fails, then the parser backs up |
1020 | // to `[`. |
1021 | if let Some(class) = self.maybe_parse_posix_class() { |
1022 | union.extend_from_slice(&class.ranges); |
1023 | continue; |
1024 | } |
1025 | // ... otherwise we don't support nested classes. |
1026 | return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED)); |
1027 | } |
1028 | ']' => { |
1029 | self.bump(); |
1030 | let mut class = hir::Class::new(union); |
1031 | // Note that we must apply case folding before negation! |
1032 | // Consider `(?i)[^x]`. If we applied negation first, then |
1033 | // the result would be the character class that matched any |
1034 | // Unicode scalar value. |
1035 | if self.flags().case_insensitive { |
1036 | class.ascii_case_fold(); |
1037 | } |
1038 | if negate { |
1039 | class.negate(); |
1040 | } |
1041 | return Ok(Hir::class(class)); |
1042 | } |
1043 | '&' if self.peek() == Some('&' ) => { |
1044 | return Err(Error::new( |
1045 | ERR_CLASS_INTERSECTION_UNSUPPORTED, |
1046 | )); |
1047 | } |
1048 | '-' if self.peek() == Some('-' ) => { |
1049 | return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED)); |
1050 | } |
1051 | '~' if self.peek() == Some('~' ) => { |
1052 | return Err(Error::new( |
1053 | ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, |
1054 | )); |
1055 | } |
1056 | _ => self.parse_class_range(&mut union)?, |
1057 | } |
1058 | } |
1059 | } |
1060 | |
1061 | /// Parse a single primitive item in a character class set. The item to |
1062 | /// be parsed can either be one of a simple literal character, a range |
1063 | /// between two simple literal characters or a "primitive" character |
1064 | /// class like `\w`. |
1065 | /// |
1066 | /// If an invalid escape is found, or if a character class is found where |
1067 | /// a simple literal is expected (e.g., in a range), then an error is |
1068 | /// returned. |
1069 | /// |
1070 | /// Otherwise, the range (or ranges) are appended to the given union of |
1071 | /// ranges. |
1072 | fn parse_class_range( |
1073 | &self, |
1074 | union: &mut Vec<hir::ClassRange>, |
1075 | ) -> Result<(), Error> { |
1076 | let prim1 = self.parse_class_item()?; |
1077 | self.bump_space(); |
1078 | if self.is_done() { |
1079 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM)); |
1080 | } |
1081 | // If the next char isn't a `-`, then we don't have a range. |
1082 | // There are two exceptions. If the char after a `-` is a `]`, then |
1083 | // `-` is interpreted as a literal `-`. Alternatively, if the char |
1084 | // after a `-` is a `-`, then `--` corresponds to a "difference" |
1085 | // operation. (Which we don't support in regex-lite, but error about |
1086 | // specifically in an effort to be loud about differences between the |
1087 | // main regex crate where possible.) |
1088 | if self.char() != '-' |
1089 | || self.peek_space() == Some(']' ) |
1090 | || self.peek_space() == Some('-' ) |
1091 | { |
1092 | union.extend_from_slice(&into_class_item_ranges(prim1)?); |
1093 | return Ok(()); |
1094 | } |
1095 | // OK, now we're parsing a range, so bump past the `-` and parse the |
1096 | // second half of the range. |
1097 | if !self.bump_and_bump_space() { |
1098 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
1099 | } |
1100 | let prim2 = self.parse_class_item()?; |
1101 | let range = hir::ClassRange { |
1102 | start: into_class_item_range(prim1)?, |
1103 | end: into_class_item_range(prim2)?, |
1104 | }; |
1105 | if range.start > range.end { |
1106 | return Err(Error::new(ERR_CLASS_INVALID_RANGE)); |
1107 | } |
1108 | union.push(range); |
1109 | Ok(()) |
1110 | } |
1111 | |
1112 | /// Parse a single item in a character class as a primitive, where the |
1113 | /// primitive either consists of a verbatim literal or a single escape |
1114 | /// sequence. |
1115 | /// |
1116 | /// This assumes the parser is positioned at the beginning of a primitive, |
1117 | /// and advances the parser to the first position after the primitive if |
1118 | /// successful. |
1119 | /// |
1120 | /// Note that it is the caller's responsibility to report an error if an |
1121 | /// illegal primitive was parsed. |
1122 | fn parse_class_item(&self) -> Result<Hir, Error> { |
1123 | let ch = self.char(); |
1124 | self.bump(); |
1125 | if ch == ' \\' { |
1126 | self.parse_escape() |
1127 | } else { |
1128 | Ok(Hir::char(ch)) |
1129 | } |
1130 | } |
1131 | |
1132 | /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`. |
1133 | /// |
1134 | /// This assumes the parser is positioned at the opening `[`. |
1135 | /// |
1136 | /// If no valid POSIX character class could be found, then this does not |
1137 | /// advance the parser and `None` is returned. Otherwise, the parser is |
1138 | /// advanced to the first byte following the closing `]` and the |
1139 | /// corresponding POSIX class is returned. |
1140 | fn maybe_parse_posix_class(&self) -> Option<hir::Class> { |
1141 | // POSIX character classes are interesting from a parsing perspective |
1142 | // because parsing cannot fail with any interesting error. For example, |
1143 | // in order to use an POSIX character class, it must be enclosed in |
1144 | // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think |
1145 | // of it as "POSIX character classes have the syntax `[:NAME:]` which |
1146 | // can only appear within character brackets." This means that things |
1147 | // like `[[:lower:]A]` are legal constructs. |
1148 | // |
1149 | // However, if one types an incorrect POSIX character class, e.g., |
1150 | // `[[:loower:]]`, then we treat that as if it were normal nested |
1151 | // character class containing the characters `:elorw`. (Which isn't |
1152 | // supported and results in an error in regex-lite.) One might argue |
1153 | // that we should return an error instead since the repeated colons |
1154 | // give away the intent to write an POSIX class. But what if the user |
1155 | // typed `[[:lower]]` instead? How can we tell that was intended to be |
1156 | // a POSXI class and not just a normal nested class? |
1157 | // |
1158 | // Reasonable people can probably disagree over this, but for better |
1159 | // or worse, we implement semantics that never fails at the expense of |
1160 | // better failure modes. |
1161 | assert_eq!(self.char(), '[' ); |
1162 | |
1163 | // If parsing fails, then we back up the parser to this starting point. |
1164 | let start_pos = self.pos(); |
1165 | let start_char = self.char.get(); |
1166 | let reset = || { |
1167 | self.pos.set(start_pos); |
1168 | self.char.set(start_char); |
1169 | }; |
1170 | |
1171 | let mut negated = false; |
1172 | if !self.bump() || self.char() != ':' { |
1173 | reset(); |
1174 | return None; |
1175 | } |
1176 | if !self.bump() { |
1177 | reset(); |
1178 | return None; |
1179 | } |
1180 | if self.char() == '^' { |
1181 | negated = true; |
1182 | if !self.bump() { |
1183 | reset(); |
1184 | return None; |
1185 | } |
1186 | } |
1187 | let name_start = self.pos(); |
1188 | while self.char() != ':' && self.bump() {} |
1189 | if self.is_done() { |
1190 | reset(); |
1191 | return None; |
1192 | } |
1193 | let name = &self.pattern()[name_start..self.pos()]; |
1194 | if !self.bump_if(":]" ) { |
1195 | reset(); |
1196 | return None; |
1197 | } |
1198 | if let Ok(ranges) = posix_class(name) { |
1199 | let mut class = hir::Class::new(ranges); |
1200 | if negated { |
1201 | class.negate(); |
1202 | } |
1203 | return Some(class); |
1204 | } |
1205 | reset(); |
1206 | None |
1207 | } |
1208 | |
1209 | /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the |
1210 | /// parser is currently at a valid character class name and will be |
1211 | /// advanced to the character immediately following the class. |
1212 | fn parse_perl_class(&self) -> Hir { |
1213 | let ch = self.char(); |
1214 | self.bump(); |
1215 | let mut class = hir::Class::new(match ch { |
1216 | 'd' | 'D' => posix_class("digit" ).unwrap(), |
1217 | 's' | 'S' => posix_class("space" ).unwrap(), |
1218 | 'w' | 'W' => posix_class("word" ).unwrap(), |
1219 | unk => unreachable!("invalid Perl class \\{}" , unk), |
1220 | }); |
1221 | if ch.is_ascii_uppercase() { |
1222 | class.negate(); |
1223 | } |
1224 | Hir::class(class) |
1225 | } |
1226 | |
1227 | fn hir_dot(&self) -> Hir { |
1228 | if self.flags().dot_matches_new_line { |
1229 | Hir::class(hir::Class::new([hir::ClassRange { |
1230 | start: ' \x00' , |
1231 | end: ' \u{10FFFF}' , |
1232 | }])) |
1233 | } else if self.flags().crlf { |
1234 | Hir::class(hir::Class::new([ |
1235 | hir::ClassRange { start: ' \x00' , end: ' \x09' }, |
1236 | hir::ClassRange { start: ' \x0B' , end: ' \x0C' }, |
1237 | hir::ClassRange { start: ' \x0E' , end: ' \u{10FFFF}' }, |
1238 | ])) |
1239 | } else { |
1240 | Hir::class(hir::Class::new([ |
1241 | hir::ClassRange { start: ' \x00' , end: ' \x09' }, |
1242 | hir::ClassRange { start: ' \x0B' , end: ' \u{10FFFF}' }, |
1243 | ])) |
1244 | } |
1245 | } |
1246 | |
1247 | fn hir_anchor_start(&self) -> Hir { |
1248 | let look = if self.flags().multi_line { |
1249 | if self.flags().crlf { |
1250 | hir::Look::StartCRLF |
1251 | } else { |
1252 | hir::Look::StartLF |
1253 | } |
1254 | } else { |
1255 | hir::Look::Start |
1256 | }; |
1257 | Hir::look(look) |
1258 | } |
1259 | |
1260 | fn hir_anchor_end(&self) -> Hir { |
1261 | let look = if self.flags().multi_line { |
1262 | if self.flags().crlf { |
1263 | hir::Look::EndCRLF |
1264 | } else { |
1265 | hir::Look::EndLF |
1266 | } |
1267 | } else { |
1268 | hir::Look::End |
1269 | }; |
1270 | Hir::look(look) |
1271 | } |
1272 | |
1273 | fn hir_char(&self, ch: char) -> Hir { |
1274 | if self.flags().case_insensitive { |
1275 | let this = hir::ClassRange { start: ch, end: ch }; |
1276 | if let Some(folded) = this.ascii_case_fold() { |
1277 | return Hir::class(hir::Class::new([this, folded])); |
1278 | } |
1279 | } |
1280 | Hir::char(ch) |
1281 | } |
1282 | } |
1283 | |
1284 | /// This checks the depth of the given `Hir` value, and if it exceeds the given |
1285 | /// limit, then an error is returned. |
1286 | fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { |
1287 | fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { |
1288 | if depth > limit { |
1289 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
1290 | } |
1291 | let Some(next_depth) = depth.checked_add(1) else { |
1292 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
1293 | }; |
1294 | match *hir.kind() { |
1295 | HirKind::Empty |
1296 | | HirKind::Char(_) |
1297 | | HirKind::Class(_) |
1298 | | HirKind::Look(_) => Ok(()), |
1299 | HirKind::Repetition(hir::Repetition { ref sub, .. }) => { |
1300 | recurse(sub, limit, next_depth) |
1301 | } |
1302 | HirKind::Capture(hir::Capture { ref sub, .. }) => { |
1303 | recurse(sub, limit, next_depth) |
1304 | } |
1305 | HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { |
1306 | for sub in subs.iter() { |
1307 | recurse(sub, limit, next_depth)?; |
1308 | } |
1309 | Ok(()) |
1310 | } |
1311 | } |
1312 | } |
1313 | recurse(hir, limit, 0) |
1314 | } |
1315 | |
1316 | /// Converts the given Hir to a literal char if the Hir is just a single |
1317 | /// character. Otherwise this returns an error. |
1318 | /// |
1319 | /// This is useful in contexts where you can only accept a single character, |
1320 | /// but where it is convenient to parse something more general. For example, |
1321 | /// parsing a single part of a character class range. It's useful to reuse |
1322 | /// the literal parsing code, but that code can itself return entire classes |
1323 | /// which can't be used as the start/end of a class range. |
1324 | fn into_class_item_range(hir: Hir) -> Result<char, Error> { |
1325 | match hir.kind { |
1326 | HirKind::Char(ch: char) => Ok(ch), |
1327 | _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)), |
1328 | } |
1329 | } |
1330 | |
1331 | fn into_class_item_ranges( |
1332 | mut hir: Hir, |
1333 | ) -> Result<Vec<hir::ClassRange>, Error> { |
1334 | match core::mem::replace(&mut hir.kind, src:HirKind::Empty) { |
1335 | HirKind::Char(ch: char) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), |
1336 | HirKind::Class(hir::Class { ranges: Vec }) => Ok(ranges), |
1337 | _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), |
1338 | } |
1339 | } |
1340 | |
1341 | /// Returns an iterator of character class ranges for the given named POSIX |
1342 | /// character class. If no such character class exists for the name given, then |
1343 | /// an error is returned. |
1344 | fn posix_class( |
1345 | kind: &str, |
1346 | ) -> Result<impl Iterator<Item = hir::ClassRange>, Error> { |
1347 | let slice: &'static [(u8, u8)] = match kind { |
1348 | "alnum" => &[(b'0' , b'9' ), (b'A' , b'Z' ), (b'a' , b'z' )], |
1349 | "alpha" => &[(b'A' , b'Z' ), (b'a' , b'z' )], |
1350 | "ascii" => &[(b' \x00' , b' \x7F' )], |
1351 | "blank" => &[(b' \t' , b' \t' ), (b' ' , b' ' )], |
1352 | "cntrl" => &[(b' \x00' , b' \x1F' ), (b' \x7F' , b' \x7F' )], |
1353 | "digit" => &[(b'0' , b'9' )], |
1354 | "graph" => &[(b'!' , b'~' )], |
1355 | "lower" => &[(b'a' , b'z' )], |
1356 | "print" => &[(b' ' , b'~' )], |
1357 | "punct" => &[(b'!' , b'/' ), (b':' , b'@' ), (b'[' , b'`' ), (b'{' , b'~' )], |
1358 | "space" => &[ |
1359 | (b' \t' , b' \t' ), |
1360 | (b' \n' , b' \n' ), |
1361 | (b' \x0B' , b' \x0B' ), |
1362 | (b' \x0C' , b' \x0C' ), |
1363 | (b' \r' , b' \r' ), |
1364 | (b' ' , b' ' ), |
1365 | ], |
1366 | "upper" => &[(b'A' , b'Z' )], |
1367 | "word" => &[(b'0' , b'9' ), (b'A' , b'Z' ), (b'_' , b'_' ), (b'a' , b'z' )], |
1368 | "xdigit" => &[(b'0' , b'9' ), (b'A' , b'F' ), (b'a' , b'f' )], |
1369 | _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)), |
1370 | }; |
1371 | Ok(slice.iter().map(|&(start, end)| hir::ClassRange { |
1372 | start: char::from(start), |
1373 | end: char::from(end), |
1374 | })) |
1375 | } |
1376 | |
1377 | /// Returns true if the given character is a hexadecimal digit. |
1378 | fn is_hex(c: char) -> bool { |
1379 | ('0' <= c && c <= '9' ) || ('a' <= c && c <= 'f' ) || ('A' <= c && c <= 'F' ) |
1380 | } |
1381 | |
1382 | /// Returns true if the given character is a valid in a capture group name. |
1383 | /// |
1384 | /// If `first` is true, then `c` is treated as the first character in the |
1385 | /// group name (which must be alphabetic or underscore). |
1386 | fn is_capture_char(c: char, first: bool) -> bool { |
1387 | if first { |
1388 | c == '_' || c.is_alphabetic() |
1389 | } else { |
1390 | c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() |
1391 | } |
1392 | } |
1393 | |
1394 | #[cfg (test)] |
1395 | mod tests { |
1396 | use super::*; |
1397 | |
1398 | fn p(pattern: &str) -> Hir { |
1399 | Parser::new(Config::default(), pattern).parse_inner().unwrap() |
1400 | } |
1401 | |
1402 | fn perr(pattern: &str) -> String { |
1403 | Parser::new(Config::default(), pattern) |
1404 | .parse_inner() |
1405 | .unwrap_err() |
1406 | .to_string() |
1407 | } |
1408 | |
1409 | fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir { |
1410 | Hir::class(hir::Class::new( |
1411 | it.into_iter().map(|(start, end)| hir::ClassRange { start, end }), |
1412 | )) |
1413 | } |
1414 | |
1415 | fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir { |
1416 | Hir::class(hir::Class::new( |
1417 | it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }), |
1418 | )) |
1419 | } |
1420 | |
1421 | fn posix(name: &str) -> Hir { |
1422 | Hir::class(hir::Class::new(posix_class(name).unwrap())) |
1423 | } |
1424 | |
1425 | fn cap(index: u32, sub: Hir) -> Hir { |
1426 | Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) }) |
1427 | } |
1428 | |
1429 | fn named_cap(index: u32, name: &str, sub: Hir) -> Hir { |
1430 | Hir::capture(hir::Capture { |
1431 | index, |
1432 | name: Some(Box::from(name)), |
1433 | sub: Box::new(sub), |
1434 | }) |
1435 | } |
1436 | |
1437 | #[test ] |
1438 | fn ok_literal() { |
1439 | assert_eq!(p("a" ), Hir::char('a' )); |
1440 | assert_eq!(p("ab" ), Hir::concat(vec![Hir::char('a' ), Hir::char('b' )])); |
1441 | assert_eq!(p("💩" ), Hir::char('💩' )); |
1442 | } |
1443 | |
1444 | #[test ] |
1445 | fn ok_meta_escapes() { |
1446 | assert_eq!(p(r"\*" ), Hir::char('*' )); |
1447 | assert_eq!(p(r"\+" ), Hir::char('+' )); |
1448 | assert_eq!(p(r"\?" ), Hir::char('?' )); |
1449 | assert_eq!(p(r"\|" ), Hir::char('|' )); |
1450 | assert_eq!(p(r"\(" ), Hir::char('(' )); |
1451 | assert_eq!(p(r"\)" ), Hir::char(')' )); |
1452 | assert_eq!(p(r"\^" ), Hir::char('^' )); |
1453 | assert_eq!(p(r"\$" ), Hir::char('$' )); |
1454 | assert_eq!(p(r"\[" ), Hir::char('[' )); |
1455 | assert_eq!(p(r"\]" ), Hir::char(']' )); |
1456 | } |
1457 | |
1458 | #[test ] |
1459 | fn ok_special_escapes() { |
1460 | assert_eq!(p(r"\a" ), Hir::char(' \x07' )); |
1461 | assert_eq!(p(r"\f" ), Hir::char(' \x0C' )); |
1462 | assert_eq!(p(r"\t" ), Hir::char(' \t' )); |
1463 | assert_eq!(p(r"\n" ), Hir::char(' \n' )); |
1464 | assert_eq!(p(r"\r" ), Hir::char(' \r' )); |
1465 | assert_eq!(p(r"\v" ), Hir::char(' \x0B' )); |
1466 | assert_eq!(p(r"\A" ), Hir::look(hir::Look::Start)); |
1467 | assert_eq!(p(r"\z" ), Hir::look(hir::Look::End)); |
1468 | assert_eq!(p(r"\b" ), Hir::look(hir::Look::Word)); |
1469 | assert_eq!(p(r"\B" ), Hir::look(hir::Look::WordNegate)); |
1470 | } |
1471 | |
1472 | #[test ] |
1473 | fn ok_hex() { |
1474 | // fixed length |
1475 | assert_eq!(p(r"\x41" ), Hir::char('A' )); |
1476 | assert_eq!(p(r"\u2603" ), Hir::char('☃' )); |
1477 | assert_eq!(p(r"\U0001F4A9" ), Hir::char('💩' )); |
1478 | // braces |
1479 | assert_eq!(p(r"\x{1F4A9}" ), Hir::char('💩' )); |
1480 | assert_eq!(p(r"\u{1F4A9}" ), Hir::char('💩' )); |
1481 | assert_eq!(p(r"\U{1F4A9}" ), Hir::char('💩' )); |
1482 | } |
1483 | |
1484 | #[test ] |
1485 | fn ok_perl() { |
1486 | assert_eq!(p(r"\d" ), posix("digit" )); |
1487 | assert_eq!(p(r"\s" ), posix("space" )); |
1488 | assert_eq!(p(r"\w" ), posix("word" )); |
1489 | |
1490 | let negated = |name| { |
1491 | let mut class = hir::Class::new(posix_class(name).unwrap()); |
1492 | class.negate(); |
1493 | Hir::class(class) |
1494 | }; |
1495 | assert_eq!(p(r"\D" ), negated("digit" )); |
1496 | assert_eq!(p(r"\S" ), negated("space" )); |
1497 | assert_eq!(p(r"\W" ), negated("word" )); |
1498 | } |
1499 | |
1500 | #[test ] |
1501 | fn ok_flags_and_primitives() { |
1502 | assert_eq!(p(r"a" ), Hir::char('a' )); |
1503 | assert_eq!(p(r"(?i:a)" ), singles(['A' , 'a' ])); |
1504 | |
1505 | assert_eq!(p(r"^" ), Hir::look(hir::Look::Start)); |
1506 | assert_eq!(p(r"(?m:^)" ), Hir::look(hir::Look::StartLF)); |
1507 | assert_eq!(p(r"(?mR:^)" ), Hir::look(hir::Look::StartCRLF)); |
1508 | |
1509 | assert_eq!(p(r"$" ), Hir::look(hir::Look::End)); |
1510 | assert_eq!(p(r"(?m:$)" ), Hir::look(hir::Look::EndLF)); |
1511 | assert_eq!(p(r"(?mR:$)" ), Hir::look(hir::Look::EndCRLF)); |
1512 | |
1513 | assert_eq!(p(r"." ), class([(' \x00' , ' \x09' ), (' \x0B' , ' \u{10FFFF}' )])); |
1514 | assert_eq!( |
1515 | p(r"(?R:.)" ), |
1516 | class([ |
1517 | (' \x00' , ' \x09' ), |
1518 | (' \x0B' , ' \x0C' ), |
1519 | (' \x0E' , ' \u{10FFFF}' ), |
1520 | ]) |
1521 | ); |
1522 | assert_eq!(p(r"(?s:.)" ), class([(' \x00' , ' \u{10FFFF}' )])); |
1523 | assert_eq!(p(r"(?sR:.)" ), class([(' \x00' , ' \u{10FFFF}' )])); |
1524 | } |
1525 | |
1526 | #[test ] |
1527 | fn ok_alternate() { |
1528 | assert_eq!( |
1529 | p(r"a|b" ), |
1530 | Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )]) |
1531 | ); |
1532 | assert_eq!( |
1533 | p(r"(?:a|b)" ), |
1534 | Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )]) |
1535 | ); |
1536 | |
1537 | assert_eq!( |
1538 | p(r"(a|b)" ), |
1539 | cap(1, Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )])) |
1540 | ); |
1541 | assert_eq!( |
1542 | p(r"(?<foo>a|b)" ), |
1543 | named_cap( |
1544 | 1, |
1545 | "foo" , |
1546 | Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )]) |
1547 | ) |
1548 | ); |
1549 | |
1550 | assert_eq!( |
1551 | p(r"a|b|c" ), |
1552 | Hir::alternation(vec![ |
1553 | Hir::char('a' ), |
1554 | Hir::char('b' ), |
1555 | Hir::char('c' ) |
1556 | ]) |
1557 | ); |
1558 | |
1559 | assert_eq!( |
1560 | p(r"ax|by|cz" ), |
1561 | Hir::alternation(vec![ |
1562 | Hir::concat(vec![Hir::char('a' ), Hir::char('x' )]), |
1563 | Hir::concat(vec![Hir::char('b' ), Hir::char('y' )]), |
1564 | Hir::concat(vec![Hir::char('c' ), Hir::char('z' )]), |
1565 | ]) |
1566 | ); |
1567 | assert_eq!( |
1568 | p(r"(ax|(by|(cz)))" ), |
1569 | cap( |
1570 | 1, |
1571 | Hir::alternation(vec![ |
1572 | Hir::concat(vec![Hir::char('a' ), Hir::char('x' )]), |
1573 | cap( |
1574 | 2, |
1575 | Hir::alternation(vec![ |
1576 | Hir::concat(vec![Hir::char('b' ), Hir::char('y' )]), |
1577 | cap( |
1578 | 3, |
1579 | Hir::concat(vec![ |
1580 | Hir::char('c' ), |
1581 | Hir::char('z' ) |
1582 | ]) |
1583 | ), |
1584 | ]) |
1585 | ), |
1586 | ]) |
1587 | ) |
1588 | ); |
1589 | |
1590 | assert_eq!( |
1591 | p(r"|" ), |
1592 | Hir::alternation(vec![Hir::empty(), Hir::empty()]) |
1593 | ); |
1594 | assert_eq!( |
1595 | p(r"||" ), |
1596 | Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()]) |
1597 | ); |
1598 | |
1599 | assert_eq!( |
1600 | p(r"a|" ), |
1601 | Hir::alternation(vec![Hir::char('a' ), Hir::empty()]) |
1602 | ); |
1603 | assert_eq!( |
1604 | p(r"|a" ), |
1605 | Hir::alternation(vec![Hir::empty(), Hir::char('a' )]) |
1606 | ); |
1607 | |
1608 | assert_eq!( |
1609 | p(r"(|)" ), |
1610 | cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()])) |
1611 | ); |
1612 | assert_eq!( |
1613 | p(r"(a|)" ), |
1614 | cap(1, Hir::alternation(vec![Hir::char('a' ), Hir::empty()])) |
1615 | ); |
1616 | assert_eq!( |
1617 | p(r"(|a)" ), |
1618 | cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a' )])) |
1619 | ); |
1620 | } |
1621 | |
1622 | #[test ] |
1623 | fn ok_flag_group() { |
1624 | assert_eq!( |
1625 | p("a(?i:b)" ), |
1626 | Hir::concat(vec![Hir::char('a' ), singles(['B' , 'b' ])]) |
1627 | ); |
1628 | } |
1629 | |
1630 | #[test ] |
1631 | fn ok_flag_directive() { |
1632 | assert_eq!(p("(?i)a" ), singles(['A' , 'a' ])); |
1633 | assert_eq!(p("a(?i)" ), Hir::char('a' )); |
1634 | assert_eq!( |
1635 | p("a(?i)b" ), |
1636 | Hir::concat(vec![Hir::char('a' ), singles(['B' , 'b' ])]) |
1637 | ); |
1638 | assert_eq!( |
1639 | p("a(?i)a(?-i)a" ), |
1640 | Hir::concat(vec![ |
1641 | Hir::char('a' ), |
1642 | singles(['A' , 'a' ]), |
1643 | Hir::char('a' ), |
1644 | ]) |
1645 | ); |
1646 | assert_eq!( |
1647 | p("a(?:(?i)a)a" ), |
1648 | Hir::concat(vec![ |
1649 | Hir::char('a' ), |
1650 | singles(['A' , 'a' ]), |
1651 | Hir::char('a' ), |
1652 | ]) |
1653 | ); |
1654 | assert_eq!( |
1655 | p("a((?i)a)a" ), |
1656 | Hir::concat(vec![ |
1657 | Hir::char('a' ), |
1658 | cap(1, singles(['A' , 'a' ])), |
1659 | Hir::char('a' ), |
1660 | ]) |
1661 | ); |
1662 | } |
1663 | |
1664 | #[test ] |
1665 | fn ok_uncounted_repetition() { |
1666 | assert_eq!( |
1667 | p(r"a?" ), |
1668 | Hir::repetition(hir::Repetition { |
1669 | min: 0, |
1670 | max: Some(1), |
1671 | greedy: true, |
1672 | sub: Box::new(Hir::char('a' )), |
1673 | }), |
1674 | ); |
1675 | assert_eq!( |
1676 | p(r"a*" ), |
1677 | Hir::repetition(hir::Repetition { |
1678 | min: 0, |
1679 | max: None, |
1680 | greedy: true, |
1681 | sub: Box::new(Hir::char('a' )), |
1682 | }), |
1683 | ); |
1684 | assert_eq!( |
1685 | p(r"a+" ), |
1686 | Hir::repetition(hir::Repetition { |
1687 | min: 1, |
1688 | max: None, |
1689 | greedy: true, |
1690 | sub: Box::new(Hir::char('a' )), |
1691 | }), |
1692 | ); |
1693 | |
1694 | assert_eq!( |
1695 | p(r"a??" ), |
1696 | Hir::repetition(hir::Repetition { |
1697 | min: 0, |
1698 | max: Some(1), |
1699 | greedy: false, |
1700 | sub: Box::new(Hir::char('a' )), |
1701 | }), |
1702 | ); |
1703 | assert_eq!( |
1704 | p(r"a*?" ), |
1705 | Hir::repetition(hir::Repetition { |
1706 | min: 0, |
1707 | max: None, |
1708 | greedy: false, |
1709 | sub: Box::new(Hir::char('a' )), |
1710 | }), |
1711 | ); |
1712 | assert_eq!( |
1713 | p(r"a+?" ), |
1714 | Hir::repetition(hir::Repetition { |
1715 | min: 1, |
1716 | max: None, |
1717 | greedy: false, |
1718 | sub: Box::new(Hir::char('a' )), |
1719 | }), |
1720 | ); |
1721 | |
1722 | assert_eq!( |
1723 | p(r"a?b" ), |
1724 | Hir::concat(vec![ |
1725 | Hir::repetition(hir::Repetition { |
1726 | min: 0, |
1727 | max: Some(1), |
1728 | greedy: true, |
1729 | sub: Box::new(Hir::char('a' )), |
1730 | }), |
1731 | Hir::char('b' ), |
1732 | ]), |
1733 | ); |
1734 | |
1735 | assert_eq!( |
1736 | p(r"ab?" ), |
1737 | Hir::concat(vec![ |
1738 | Hir::char('a' ), |
1739 | Hir::repetition(hir::Repetition { |
1740 | min: 0, |
1741 | max: Some(1), |
1742 | greedy: true, |
1743 | sub: Box::new(Hir::char('b' )), |
1744 | }), |
1745 | ]), |
1746 | ); |
1747 | |
1748 | assert_eq!( |
1749 | p(r"(?:ab)?" ), |
1750 | Hir::repetition(hir::Repetition { |
1751 | min: 0, |
1752 | max: Some(1), |
1753 | greedy: true, |
1754 | sub: Box::new(Hir::concat(vec![ |
1755 | Hir::char('a' ), |
1756 | Hir::char('b' ) |
1757 | ])), |
1758 | }), |
1759 | ); |
1760 | |
1761 | assert_eq!( |
1762 | p(r"(ab)?" ), |
1763 | Hir::repetition(hir::Repetition { |
1764 | min: 0, |
1765 | max: Some(1), |
1766 | greedy: true, |
1767 | sub: Box::new(cap( |
1768 | 1, |
1769 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
1770 | )), |
1771 | }), |
1772 | ); |
1773 | |
1774 | assert_eq!( |
1775 | p(r"|a?" ), |
1776 | Hir::alternation(vec![ |
1777 | Hir::empty(), |
1778 | Hir::repetition(hir::Repetition { |
1779 | min: 0, |
1780 | max: Some(1), |
1781 | greedy: true, |
1782 | sub: Box::new(Hir::char('a' )), |
1783 | }) |
1784 | ]), |
1785 | ); |
1786 | } |
1787 | |
1788 | #[test ] |
1789 | fn ok_counted_repetition() { |
1790 | assert_eq!( |
1791 | p(r"a{5}" ), |
1792 | Hir::repetition(hir::Repetition { |
1793 | min: 5, |
1794 | max: Some(5), |
1795 | greedy: true, |
1796 | sub: Box::new(Hir::char('a' )), |
1797 | }), |
1798 | ); |
1799 | assert_eq!( |
1800 | p(r"a{5}?" ), |
1801 | Hir::repetition(hir::Repetition { |
1802 | min: 5, |
1803 | max: Some(5), |
1804 | greedy: false, |
1805 | sub: Box::new(Hir::char('a' )), |
1806 | }), |
1807 | ); |
1808 | |
1809 | assert_eq!( |
1810 | p(r"a{5,}" ), |
1811 | Hir::repetition(hir::Repetition { |
1812 | min: 5, |
1813 | max: None, |
1814 | greedy: true, |
1815 | sub: Box::new(Hir::char('a' )), |
1816 | }), |
1817 | ); |
1818 | |
1819 | assert_eq!( |
1820 | p(r"a{5,9}" ), |
1821 | Hir::repetition(hir::Repetition { |
1822 | min: 5, |
1823 | max: Some(9), |
1824 | greedy: true, |
1825 | sub: Box::new(Hir::char('a' )), |
1826 | }), |
1827 | ); |
1828 | |
1829 | assert_eq!( |
1830 | p(r"ab{5}c" ), |
1831 | Hir::concat(vec![ |
1832 | Hir::char('a' ), |
1833 | Hir::repetition(hir::Repetition { |
1834 | min: 5, |
1835 | max: Some(5), |
1836 | greedy: true, |
1837 | sub: Box::new(Hir::char('b' )), |
1838 | }), |
1839 | Hir::char('c' ), |
1840 | ]), |
1841 | ); |
1842 | |
1843 | assert_eq!( |
1844 | p(r"a{ 5 }" ), |
1845 | Hir::repetition(hir::Repetition { |
1846 | min: 5, |
1847 | max: Some(5), |
1848 | greedy: true, |
1849 | sub: Box::new(Hir::char('a' )), |
1850 | }), |
1851 | ); |
1852 | assert_eq!( |
1853 | p(r"a{ 5 , 9 }" ), |
1854 | Hir::repetition(hir::Repetition { |
1855 | min: 5, |
1856 | max: Some(9), |
1857 | greedy: true, |
1858 | sub: Box::new(Hir::char('a' )), |
1859 | }), |
1860 | ); |
1861 | } |
1862 | |
1863 | #[test ] |
1864 | fn ok_group_unnamed() { |
1865 | assert_eq!(p("(a)" ), cap(1, Hir::char('a' ))); |
1866 | assert_eq!( |
1867 | p("(ab)" ), |
1868 | cap(1, Hir::concat(vec![Hir::char('a' ), Hir::char('b' )])) |
1869 | ); |
1870 | } |
1871 | |
1872 | #[test ] |
1873 | fn ok_group_named() { |
1874 | assert_eq!(p("(?P<foo>a)" ), named_cap(1, "foo" , Hir::char('a' ))); |
1875 | assert_eq!(p("(?<foo>a)" ), named_cap(1, "foo" , Hir::char('a' ))); |
1876 | |
1877 | assert_eq!( |
1878 | p("(?P<foo>ab)" ), |
1879 | named_cap( |
1880 | 1, |
1881 | "foo" , |
1882 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
1883 | ) |
1884 | ); |
1885 | assert_eq!( |
1886 | p("(?<foo>ab)" ), |
1887 | named_cap( |
1888 | 1, |
1889 | "foo" , |
1890 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
1891 | ) |
1892 | ); |
1893 | |
1894 | assert_eq!(p(r"(?<a>z)" ), named_cap(1, "a" , Hir::char('z' ))); |
1895 | assert_eq!(p(r"(?P<a>z)" ), named_cap(1, "a" , Hir::char('z' ))); |
1896 | |
1897 | assert_eq!(p(r"(?<a_1>z)" ), named_cap(1, "a_1" , Hir::char('z' ))); |
1898 | assert_eq!(p(r"(?P<a_1>z)" ), named_cap(1, "a_1" , Hir::char('z' ))); |
1899 | |
1900 | assert_eq!(p(r"(?<a.1>z)" ), named_cap(1, "a.1" , Hir::char('z' ))); |
1901 | assert_eq!(p(r"(?P<a.1>z)" ), named_cap(1, "a.1" , Hir::char('z' ))); |
1902 | |
1903 | assert_eq!(p(r"(?<a[1]>z)" ), named_cap(1, "a[1]" , Hir::char('z' ))); |
1904 | assert_eq!(p(r"(?P<a[1]>z)" ), named_cap(1, "a[1]" , Hir::char('z' ))); |
1905 | |
1906 | assert_eq!(p(r"(?<a¾>z)" ), named_cap(1, "a¾" , Hir::char('z' ))); |
1907 | assert_eq!(p(r"(?P<a¾>z)" ), named_cap(1, "a¾" , Hir::char('z' ))); |
1908 | |
1909 | assert_eq!(p(r"(?<名字>z)" ), named_cap(1, "名字" , Hir::char('z' ))); |
1910 | assert_eq!(p(r"(?P<名字>z)" ), named_cap(1, "名字" , Hir::char('z' ))); |
1911 | } |
1912 | |
1913 | #[test ] |
1914 | fn ok_class() { |
1915 | assert_eq!(p(r"[a]" ), singles(['a' ])); |
1916 | assert_eq!(p(r"[a\]]" ), singles(['a' , ']' ])); |
1917 | assert_eq!(p(r"[a\-z]" ), singles(['a' , '-' , 'z' ])); |
1918 | assert_eq!(p(r"[ab]" ), class([('a' , 'b' )])); |
1919 | assert_eq!(p(r"[a-]" ), singles(['a' , '-' ])); |
1920 | assert_eq!(p(r"[-a]" ), singles(['a' , '-' ])); |
1921 | assert_eq!(p(r"[--a]" ), singles(['a' , '-' ])); |
1922 | assert_eq!(p(r"[---a]" ), singles(['a' , '-' ])); |
1923 | assert_eq!(p(r"[[:alnum:]]" ), posix("alnum" )); |
1924 | assert_eq!(p(r"[\w]" ), posix("word" )); |
1925 | assert_eq!(p(r"[a\wz]" ), posix("word" )); |
1926 | assert_eq!(p(r"[\s\S]" ), class([(' \x00' , ' \u{10FFFF}' )])); |
1927 | assert_eq!(p(r"[^\s\S]" ), Hir::fail()); |
1928 | assert_eq!(p(r"[a-cx-z]" ), class([('a' , 'c' ), ('x' , 'z' )])); |
1929 | assert_eq!(p(r"[☃-⛄]" ), class([('☃' , '⛄' )])); |
1930 | assert_eq!(p(r"[]]" ), singles([']' ])); |
1931 | assert_eq!(p(r"[]a]" ), singles([']' , 'a' ])); |
1932 | assert_eq!(p(r"[]\[]" ), singles(['[' , ']' ])); |
1933 | assert_eq!(p(r"[\[]" ), singles(['[' ])); |
1934 | |
1935 | assert_eq!(p(r"(?i)[a]" ), singles(['A' , 'a' ])); |
1936 | assert_eq!(p(r"(?i)[A]" ), singles(['A' , 'a' ])); |
1937 | assert_eq!(p(r"(?i)[k]" ), singles(['K' , 'k' ])); |
1938 | assert_eq!(p(r"(?i)[s]" ), singles(['S' , 's' ])); |
1939 | assert_eq!(p(r"(?i)[β]" ), singles(['β' ])); |
1940 | |
1941 | assert_eq!(p(r"[^^]" ), class([(' \x00' , ']' ), ('_' , ' \u{10FFFF}' )])); |
1942 | assert_eq!( |
1943 | p(r"[^-a]" ), |
1944 | class([(' \x00' , ',' ), ('.' , '`' ), ('b' , ' \u{10FFFF}' )]) |
1945 | ); |
1946 | |
1947 | assert_eq!( |
1948 | p(r"[-]a]" ), |
1949 | Hir::concat(vec![singles(['-' ]), Hir::char('a' ), Hir::char(']' )]) |
1950 | ); |
1951 | } |
1952 | |
1953 | #[test ] |
1954 | fn ok_verbatim() { |
1955 | assert_eq!( |
1956 | p(r"(?x)a{5,9} ?" ), |
1957 | Hir::repetition(hir::Repetition { |
1958 | min: 5, |
1959 | max: Some(9), |
1960 | greedy: false, |
1961 | sub: Box::new(Hir::char('a' )), |
1962 | }) |
1963 | ); |
1964 | assert_eq!(p(r"(?x)[ a]" ), singles(['a' ])); |
1965 | assert_eq!( |
1966 | p(r"(?x)[ ^ a]" ), |
1967 | class([(' \x00' , '`' ), ('b' , ' \u{10FFFF}' )]) |
1968 | ); |
1969 | assert_eq!(p(r"(?x)[ - a]" ), singles(['a' , '-' ])); |
1970 | assert_eq!(p(r"(?x)[ ] a]" ), singles([']' , 'a' ])); |
1971 | |
1972 | assert_eq!( |
1973 | p(r"(?x)a b" ), |
1974 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
1975 | ); |
1976 | assert_eq!( |
1977 | p(r"(?x)a b(?-x)a b" ), |
1978 | Hir::concat(vec![ |
1979 | Hir::char('a' ), |
1980 | Hir::char('b' ), |
1981 | Hir::char('a' ), |
1982 | Hir::char(' ' ), |
1983 | Hir::char('b' ), |
1984 | ]) |
1985 | ); |
1986 | assert_eq!( |
1987 | p(r"a (?x:a )a " ), |
1988 | Hir::concat(vec![ |
1989 | Hir::char('a' ), |
1990 | Hir::char(' ' ), |
1991 | Hir::char('a' ), |
1992 | Hir::char('a' ), |
1993 | Hir::char(' ' ), |
1994 | ]) |
1995 | ); |
1996 | assert_eq!( |
1997 | p(r"(?x)( ?P<foo> a )" ), |
1998 | named_cap(1, "foo" , Hir::char('a' )), |
1999 | ); |
2000 | assert_eq!(p(r"(?x)( a )" ), cap(1, Hir::char('a' ))); |
2001 | assert_eq!(p(r"(?x)( ?: a )" ), Hir::char('a' )); |
2002 | assert_eq!(p(r"(?x)\x { 53 }" ), Hir::char(' \x53' )); |
2003 | assert_eq!(p(r"(?x)\ " ), Hir::char(' ' )); |
2004 | } |
2005 | |
2006 | #[test ] |
2007 | fn ok_comments() { |
2008 | let pat = "(?x) |
2009 | # This is comment 1. |
2010 | foo # This is comment 2. |
2011 | # This is comment 3. |
2012 | bar |
2013 | # This is comment 4." ; |
2014 | assert_eq!( |
2015 | p(pat), |
2016 | Hir::concat(vec![ |
2017 | Hir::char('f' ), |
2018 | Hir::char('o' ), |
2019 | Hir::char('o' ), |
2020 | Hir::char('b' ), |
2021 | Hir::char('a' ), |
2022 | Hir::char('r' ), |
2023 | ]) |
2024 | ); |
2025 | } |
2026 | |
2027 | #[test ] |
2028 | fn err_standard() { |
2029 | assert_eq!( |
2030 | ERR_TOO_MUCH_NESTING, |
2031 | perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))" ), |
2032 | ); |
2033 | // This one is tricky, because the only way it can happen is if the |
2034 | // number of captures overflows u32. Perhaps we should allow setting a |
2035 | // lower limit? |
2036 | // assert_eq!(ERR_TOO_MANY_CAPTURES, perr("")); |
2037 | assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)" )); |
2038 | assert_eq!(ERR_UNCLOSED_GROUP, perr("(" )); |
2039 | assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?" )); |
2040 | assert_eq!(ERR_UNOPENED_GROUP, perr(")" )); |
2041 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)" )); |
2042 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)" )); |
2043 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)" )); |
2044 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)" )); |
2045 | assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)" )); |
2046 | assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<" )); |
2047 | assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<" )); |
2048 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)" )); |
2049 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)" )); |
2050 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)" )); |
2051 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)" )); |
2052 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)" )); |
2053 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)" )); |
2054 | assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo" )); |
2055 | assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo" )); |
2056 | assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)" )); |
2057 | assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)" )); |
2058 | assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)" )); |
2059 | assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)" )); |
2060 | assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)" )); |
2061 | assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)" )); |
2062 | assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is" )); |
2063 | assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)" )); |
2064 | assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}" )); |
2065 | assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{" )); |
2066 | assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A" )); |
2067 | assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}" )); |
2068 | assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}" )); |
2069 | assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA" )); |
2070 | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ" )); |
2071 | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA" )); |
2072 | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ" )); |
2073 | assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800" )); |
2074 | assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF" )); |
2075 | assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x" )); |
2076 | assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\" )); |
2077 | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0" )); |
2078 | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1" )); |
2079 | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8" )); |
2080 | assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL" )); |
2081 | assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}" )); |
2082 | assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i" )); |
2083 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?" )); |
2084 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*" )); |
2085 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+" )); |
2086 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)" )); |
2087 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?" )); |
2088 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?" )); |
2089 | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}" )); |
2090 | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})" )); |
2091 | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}" )); |
2092 | assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{" )); |
2093 | assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5" )); |
2094 | assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5," )); |
2095 | assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6" )); |
2096 | assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z" )); |
2097 | assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}" )); |
2098 | assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}" )); |
2099 | assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}" )); |
2100 | assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}" )); |
2101 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a" )); |
2102 | assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]" )); |
2103 | assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]" )); |
2104 | assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]" )); |
2105 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-" )); |
2106 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^" )); |
2107 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]" )); |
2108 | assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]" )); |
2109 | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[" )); |
2110 | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z" )); |
2111 | assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]" )); |
2112 | assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]" )); |
2113 | assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]" )); |
2114 | assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]" )); |
2115 | assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]" )); |
2116 | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo" )); |
2117 | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}" )); |
2118 | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}" )); |
2119 | assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{" )); |
2120 | assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ " )); |
2121 | } |
2122 | |
2123 | #[test ] |
2124 | fn err_verbatim() { |
2125 | // See: https://github.com/rust-lang/regex/issues/792 |
2126 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]" )); |
2127 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a " )); |
2128 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- " )); |
2129 | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ " )); |
2130 | } |
2131 | |
2132 | // This tests a bug fix where the nest limit checker wasn't decrementing |
2133 | // its depth during post-traversal, which causes long regexes to trip |
2134 | // the default limit too aggressively. |
2135 | #[test ] |
2136 | fn regression_454_nest_too_big() { |
2137 | let pattern = r#" |
2138 | 2(?: |
2139 | [45]\d{3}| |
2140 | 7(?: |
2141 | 1[0-267]| |
2142 | 2[0-289]| |
2143 | 3[0-29]| |
2144 | 4[01]| |
2145 | 5[1-3]| |
2146 | 6[013]| |
2147 | 7[0178]| |
2148 | 91 |
2149 | )| |
2150 | 8(?: |
2151 | 0[125]| |
2152 | [139][1-6]| |
2153 | 2[0157-9]| |
2154 | 41| |
2155 | 6[1-35]| |
2156 | 7[1-5]| |
2157 | 8[1-8]| |
2158 | 90 |
2159 | )| |
2160 | 9(?: |
2161 | 0[0-2]| |
2162 | 1[0-4]| |
2163 | 2[568]| |
2164 | 3[3-6]| |
2165 | 5[5-7]| |
2166 | 6[0167]| |
2167 | 7[15]| |
2168 | 8[0146-9] |
2169 | ) |
2170 | )\d{4} |
2171 | "# ; |
2172 | p(pattern); |
2173 | } |
2174 | |
2175 | // This tests that we treat a trailing `-` in a character class as a |
2176 | // literal `-` even when whitespace mode is enabled and there is whitespace |
2177 | // after the trailing `-`. |
2178 | #[test ] |
2179 | fn regression_455_trailing_dash_ignore_whitespace() { |
2180 | p("(?x)[ / - ]" ); |
2181 | p("(?x)[ a - ]" ); |
2182 | p("(?x)[ |
2183 | a |
2184 | - ] |
2185 | " ); |
2186 | p("(?x)[ |
2187 | a # wat |
2188 | - ] |
2189 | " ); |
2190 | |
2191 | perr("(?x)[ / -" ); |
2192 | perr("(?x)[ / - " ); |
2193 | perr( |
2194 | "(?x)[ |
2195 | / - |
2196 | " , |
2197 | ); |
2198 | perr( |
2199 | "(?x)[ |
2200 | / - # wat |
2201 | " , |
2202 | ); |
2203 | } |
2204 | |
2205 | #[test ] |
2206 | fn regression_capture_indices() { |
2207 | let got = p(r"(a|ab|c|bcd){4,10}(d*)" ); |
2208 | assert_eq!( |
2209 | got, |
2210 | Hir::concat(vec![ |
2211 | Hir::repetition(hir::Repetition { |
2212 | min: 4, |
2213 | max: Some(10), |
2214 | greedy: true, |
2215 | sub: Box::new(cap( |
2216 | 1, |
2217 | Hir::alternation(vec![ |
2218 | Hir::char('a' ), |
2219 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]), |
2220 | Hir::char('c' ), |
2221 | Hir::concat(vec![ |
2222 | Hir::char('b' ), |
2223 | Hir::char('c' ), |
2224 | Hir::char('d' ) |
2225 | ]), |
2226 | ]) |
2227 | )) |
2228 | }), |
2229 | cap( |
2230 | 2, |
2231 | Hir::repetition(hir::Repetition { |
2232 | min: 0, |
2233 | max: None, |
2234 | greedy: true, |
2235 | sub: Box::new(Hir::char('d' )), |
2236 | }) |
2237 | ), |
2238 | ]) |
2239 | ); |
2240 | } |
2241 | } |
2242 | |