1 | /*! |
2 | Defines a translator that converts an `Ast` to an `Hir`. |
3 | */ |
4 | |
5 | use core::cell::{Cell, RefCell}; |
6 | |
7 | use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; |
8 | |
9 | use crate::{ |
10 | ast::{self, Ast, Span, Visitor}, |
11 | either::Either, |
12 | hir::{self, Error, ErrorKind, Hir, HirKind}, |
13 | unicode::{self, ClassQuery}, |
14 | }; |
15 | |
16 | type Result<T> = core::result::Result<T, Error>; |
17 | |
18 | /// A builder for constructing an AST->HIR translator. |
19 | #[derive (Clone, Debug)] |
20 | pub struct TranslatorBuilder { |
21 | utf8: bool, |
22 | flags: Flags, |
23 | } |
24 | |
25 | impl Default for TranslatorBuilder { |
26 | fn default() -> TranslatorBuilder { |
27 | TranslatorBuilder::new() |
28 | } |
29 | } |
30 | |
31 | impl TranslatorBuilder { |
32 | /// Create a new translator builder with a default c onfiguration. |
33 | pub fn new() -> TranslatorBuilder { |
34 | TranslatorBuilder { utf8: true, flags: Flags::default() } |
35 | } |
36 | |
37 | /// Build a translator using the current configuration. |
38 | pub fn build(&self) -> Translator { |
39 | Translator { |
40 | stack: RefCell::new(vec![]), |
41 | flags: Cell::new(self.flags), |
42 | utf8: self.utf8, |
43 | } |
44 | } |
45 | |
46 | /// When disabled, translation will permit the construction of a regular |
47 | /// expression that may match invalid UTF-8. |
48 | /// |
49 | /// When enabled (the default), the translator is guaranteed to produce an |
50 | /// expression that, for non-empty matches, will only ever produce spans |
51 | /// that are entirely valid UTF-8 (otherwise, the translator will return an |
52 | /// error). |
53 | /// |
54 | /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even |
55 | /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete |
56 | /// syntax) will be allowed even though they can produce matches that split |
57 | /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" |
58 | /// matches, and it is expected that the regex engine itself must handle |
59 | /// these cases if necessary (perhaps by suppressing any zero-width matches |
60 | /// that split a codepoint). |
61 | pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { |
62 | self.utf8 = yes; |
63 | self |
64 | } |
65 | |
66 | /// Enable or disable the case insensitive flag (`i`) by default. |
67 | pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { |
68 | self.flags.case_insensitive = if yes { Some(true) } else { None }; |
69 | self |
70 | } |
71 | |
72 | /// Enable or disable the multi-line matching flag (`m`) by default. |
73 | pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { |
74 | self.flags.multi_line = if yes { Some(true) } else { None }; |
75 | self |
76 | } |
77 | |
78 | /// Enable or disable the "dot matches any character" flag (`s`) by |
79 | /// default. |
80 | pub fn dot_matches_new_line( |
81 | &mut self, |
82 | yes: bool, |
83 | ) -> &mut TranslatorBuilder { |
84 | self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; |
85 | self |
86 | } |
87 | |
88 | /// Enable or disable the CRLF mode flag (`R`) by default. |
89 | pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { |
90 | self.flags.crlf = if yes { Some(true) } else { None }; |
91 | self |
92 | } |
93 | |
94 | /// Enable or disable the "swap greed" flag (`U`) by default. |
95 | pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { |
96 | self.flags.swap_greed = if yes { Some(true) } else { None }; |
97 | self |
98 | } |
99 | |
100 | /// Enable or disable the Unicode flag (`u`) by default. |
101 | pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { |
102 | self.flags.unicode = if yes { None } else { Some(false) }; |
103 | self |
104 | } |
105 | } |
106 | |
107 | /// A translator maps abstract syntax to a high level intermediate |
108 | /// representation. |
109 | /// |
110 | /// A translator may be benefit from reuse. That is, a translator can translate |
111 | /// many abstract syntax trees. |
112 | /// |
113 | /// A `Translator` can be configured in more detail via a |
114 | /// [`TranslatorBuilder`]. |
115 | #[derive (Clone, Debug)] |
116 | pub struct Translator { |
117 | /// Our call stack, but on the heap. |
118 | stack: RefCell<Vec<HirFrame>>, |
119 | /// The current flag settings. |
120 | flags: Cell<Flags>, |
121 | /// Whether we're allowed to produce HIR that can match arbitrary bytes. |
122 | utf8: bool, |
123 | } |
124 | |
125 | impl Translator { |
126 | /// Create a new translator using the default configuration. |
127 | pub fn new() -> Translator { |
128 | TranslatorBuilder::new().build() |
129 | } |
130 | |
131 | /// Translate the given abstract syntax tree (AST) into a high level |
132 | /// intermediate representation (HIR). |
133 | /// |
134 | /// If there was a problem doing the translation, then an HIR-specific |
135 | /// error is returned. |
136 | /// |
137 | /// The original pattern string used to produce the `Ast` *must* also be |
138 | /// provided. The translator does not use the pattern string during any |
139 | /// correct translation, but is used for error reporting. |
140 | pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> { |
141 | ast::visit(ast, visitor:TranslatorI::new(self, pattern)) |
142 | } |
143 | } |
144 | |
145 | /// An HirFrame is a single stack frame, represented explicitly, which is |
146 | /// created for each item in the Ast that we traverse. |
147 | /// |
148 | /// Note that technically, this type doesn't represent our entire stack |
149 | /// frame. In particular, the Ast visitor represents any state associated with |
150 | /// traversing the Ast itself. |
151 | #[derive (Clone, Debug)] |
152 | enum HirFrame { |
153 | /// An arbitrary HIR expression. These get pushed whenever we hit a base |
154 | /// case in the Ast. They get popped after an inductive (i.e., recursive) |
155 | /// step is complete. |
156 | Expr(Hir), |
157 | /// A literal that is being constructed, character by character, from the |
158 | /// AST. We need this because the AST gives each individual character its |
159 | /// own node. So as we see characters, we peek at the top-most HirFrame. |
160 | /// If it's a literal, then we add to it. Otherwise, we push a new literal. |
161 | /// When it comes time to pop it, we convert it to an Hir via Hir::literal. |
162 | Literal(Vec<u8>), |
163 | /// A Unicode character class. This frame is mutated as we descend into |
164 | /// the Ast of a character class (which is itself its own mini recursive |
165 | /// structure). |
166 | ClassUnicode(hir::ClassUnicode), |
167 | /// A byte-oriented character class. This frame is mutated as we descend |
168 | /// into the Ast of a character class (which is itself its own mini |
169 | /// recursive structure). |
170 | /// |
171 | /// Byte character classes are created when Unicode mode (`u`) is disabled. |
172 | /// If `utf8` is enabled (the default), then a byte character is only |
173 | /// permitted to match ASCII text. |
174 | ClassBytes(hir::ClassBytes), |
175 | /// This is pushed whenever a repetition is observed. After visiting every |
176 | /// sub-expression in the repetition, the translator's stack is expected to |
177 | /// have this sentinel at the top. |
178 | /// |
179 | /// This sentinel only exists to stop other things (like flattening |
180 | /// literals) from reaching across repetition operators. |
181 | Repetition, |
182 | /// This is pushed on to the stack upon first seeing any kind of capture, |
183 | /// indicated by parentheses (including non-capturing groups). It is popped |
184 | /// upon leaving a group. |
185 | Group { |
186 | /// The old active flags when this group was opened. |
187 | /// |
188 | /// If this group sets flags, then the new active flags are set to the |
189 | /// result of merging the old flags with the flags introduced by this |
190 | /// group. If the group doesn't set any flags, then this is simply |
191 | /// equivalent to whatever flags were set when the group was opened. |
192 | /// |
193 | /// When this group is popped, the active flags should be restored to |
194 | /// the flags set here. |
195 | /// |
196 | /// The "active" flags correspond to whatever flags are set in the |
197 | /// Translator. |
198 | old_flags: Flags, |
199 | }, |
200 | /// This is pushed whenever a concatenation is observed. After visiting |
201 | /// every sub-expression in the concatenation, the translator's stack is |
202 | /// popped until it sees a Concat frame. |
203 | Concat, |
204 | /// This is pushed whenever an alternation is observed. After visiting |
205 | /// every sub-expression in the alternation, the translator's stack is |
206 | /// popped until it sees an Alternation frame. |
207 | Alternation, |
208 | /// This is pushed immediately before each sub-expression in an |
209 | /// alternation. This separates the branches of an alternation on the |
210 | /// stack and prevents literal flattening from reaching across alternation |
211 | /// branches. |
212 | /// |
213 | /// It is popped after each expression in a branch until an 'Alternation' |
214 | /// frame is observed when doing a post visit on an alternation. |
215 | AlternationBranch, |
216 | } |
217 | |
218 | impl HirFrame { |
219 | /// Assert that the current stack frame is an Hir expression and return it. |
220 | fn unwrap_expr(self) -> Hir { |
221 | match self { |
222 | HirFrame::Expr(expr) => expr, |
223 | HirFrame::Literal(lit) => Hir::literal(lit), |
224 | _ => panic!("tried to unwrap expr from HirFrame, got: {:?}" , self), |
225 | } |
226 | } |
227 | |
228 | /// Assert that the current stack frame is a Unicode class expression and |
229 | /// return it. |
230 | fn unwrap_class_unicode(self) -> hir::ClassUnicode { |
231 | match self { |
232 | HirFrame::ClassUnicode(cls) => cls, |
233 | _ => panic!( |
234 | "tried to unwrap Unicode class \ |
235 | from HirFrame, got: {:?}" , |
236 | self |
237 | ), |
238 | } |
239 | } |
240 | |
241 | /// Assert that the current stack frame is a byte class expression and |
242 | /// return it. |
243 | fn unwrap_class_bytes(self) -> hir::ClassBytes { |
244 | match self { |
245 | HirFrame::ClassBytes(cls) => cls, |
246 | _ => panic!( |
247 | "tried to unwrap byte class \ |
248 | from HirFrame, got: {:?}" , |
249 | self |
250 | ), |
251 | } |
252 | } |
253 | |
254 | /// Assert that the current stack frame is a repetition sentinel. If it |
255 | /// isn't, then panic. |
256 | fn unwrap_repetition(self) { |
257 | match self { |
258 | HirFrame::Repetition => {} |
259 | _ => { |
260 | panic!( |
261 | "tried to unwrap repetition from HirFrame, got: {:?}" , |
262 | self |
263 | ) |
264 | } |
265 | } |
266 | } |
267 | |
268 | /// Assert that the current stack frame is a group indicator and return |
269 | /// its corresponding flags (the flags that were active at the time the |
270 | /// group was entered). |
271 | fn unwrap_group(self) -> Flags { |
272 | match self { |
273 | HirFrame::Group { old_flags } => old_flags, |
274 | _ => { |
275 | panic!("tried to unwrap group from HirFrame, got: {:?}" , self) |
276 | } |
277 | } |
278 | } |
279 | |
280 | /// Assert that the current stack frame is an alternation pipe sentinel. If |
281 | /// it isn't, then panic. |
282 | fn unwrap_alternation_pipe(self) { |
283 | match self { |
284 | HirFrame::AlternationBranch => {} |
285 | _ => { |
286 | panic!( |
287 | "tried to unwrap alt pipe from HirFrame, got: {:?}" , |
288 | self |
289 | ) |
290 | } |
291 | } |
292 | } |
293 | } |
294 | |
295 | impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { |
296 | type Output = Hir; |
297 | type Err = Error; |
298 | |
299 | fn finish(self) -> Result<Hir> { |
300 | // ... otherwise, we should have exactly one HIR on the stack. |
301 | assert_eq!(self.trans().stack.borrow().len(), 1); |
302 | Ok(self.pop().unwrap().unwrap_expr()) |
303 | } |
304 | |
305 | fn visit_pre(&mut self, ast: &Ast) -> Result<()> { |
306 | match *ast { |
307 | Ast::Class(ast::Class::Bracketed(_)) => { |
308 | if self.flags().unicode() { |
309 | let cls = hir::ClassUnicode::empty(); |
310 | self.push(HirFrame::ClassUnicode(cls)); |
311 | } else { |
312 | let cls = hir::ClassBytes::empty(); |
313 | self.push(HirFrame::ClassBytes(cls)); |
314 | } |
315 | } |
316 | Ast::Repetition(_) => self.push(HirFrame::Repetition), |
317 | Ast::Group(ref x) => { |
318 | let old_flags = x |
319 | .flags() |
320 | .map(|ast| self.set_flags(ast)) |
321 | .unwrap_or_else(|| self.flags()); |
322 | self.push(HirFrame::Group { old_flags }); |
323 | } |
324 | Ast::Concat(ref x) if x.asts.is_empty() => {} |
325 | Ast::Concat(_) => { |
326 | self.push(HirFrame::Concat); |
327 | } |
328 | Ast::Alternation(ref x) if x.asts.is_empty() => {} |
329 | Ast::Alternation(_) => { |
330 | self.push(HirFrame::Alternation); |
331 | self.push(HirFrame::AlternationBranch); |
332 | } |
333 | _ => {} |
334 | } |
335 | Ok(()) |
336 | } |
337 | |
338 | fn visit_post(&mut self, ast: &Ast) -> Result<()> { |
339 | match *ast { |
340 | Ast::Empty(_) => { |
341 | self.push(HirFrame::Expr(Hir::empty())); |
342 | } |
343 | Ast::Flags(ref x) => { |
344 | self.set_flags(&x.flags); |
345 | // Flags in the AST are generally considered directives and |
346 | // not actual sub-expressions. However, they can be used in |
347 | // the concrete syntax like `((?i))`, and we need some kind of |
348 | // indication of an expression there, and Empty is the correct |
349 | // choice. |
350 | // |
351 | // There can also be things like `(?i)+`, but we rule those out |
352 | // in the parser. In the future, we might allow them for |
353 | // consistency sake. |
354 | self.push(HirFrame::Expr(Hir::empty())); |
355 | } |
356 | Ast::Literal(ref x) => { |
357 | match self.ast_literal_to_scalar(x)? { |
358 | Either::Right(byte) => self.push_byte(byte), |
359 | Either::Left(ch) => { |
360 | if !self.flags().unicode() && ch.len_utf8() > 1 { |
361 | return Err(self |
362 | .error(x.span, ErrorKind::UnicodeNotAllowed)); |
363 | } |
364 | match self.case_fold_char(x.span, ch)? { |
365 | None => self.push_char(ch), |
366 | Some(expr) => self.push(HirFrame::Expr(expr)), |
367 | } |
368 | } |
369 | } |
370 | // self.push(HirFrame::Expr(self.hir_literal(x)?)); |
371 | } |
372 | Ast::Dot(span) => { |
373 | self.push(HirFrame::Expr(self.hir_dot(span)?)); |
374 | } |
375 | Ast::Assertion(ref x) => { |
376 | self.push(HirFrame::Expr(self.hir_assertion(x)?)); |
377 | } |
378 | Ast::Class(ast::Class::Perl(ref x)) => { |
379 | if self.flags().unicode() { |
380 | let cls = self.hir_perl_unicode_class(x)?; |
381 | let hcls = hir::Class::Unicode(cls); |
382 | self.push(HirFrame::Expr(Hir::class(hcls))); |
383 | } else { |
384 | let cls = self.hir_perl_byte_class(x)?; |
385 | let hcls = hir::Class::Bytes(cls); |
386 | self.push(HirFrame::Expr(Hir::class(hcls))); |
387 | } |
388 | } |
389 | Ast::Class(ast::Class::Unicode(ref x)) => { |
390 | let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); |
391 | self.push(HirFrame::Expr(Hir::class(cls))); |
392 | } |
393 | Ast::Class(ast::Class::Bracketed(ref ast)) => { |
394 | if self.flags().unicode() { |
395 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
396 | self.unicode_fold_and_negate( |
397 | &ast.span, |
398 | ast.negated, |
399 | &mut cls, |
400 | )?; |
401 | let expr = Hir::class(hir::Class::Unicode(cls)); |
402 | self.push(HirFrame::Expr(expr)); |
403 | } else { |
404 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
405 | self.bytes_fold_and_negate( |
406 | &ast.span, |
407 | ast.negated, |
408 | &mut cls, |
409 | )?; |
410 | let expr = Hir::class(hir::Class::Bytes(cls)); |
411 | self.push(HirFrame::Expr(expr)); |
412 | } |
413 | } |
414 | Ast::Repetition(ref x) => { |
415 | let expr = self.pop().unwrap().unwrap_expr(); |
416 | self.pop().unwrap().unwrap_repetition(); |
417 | self.push(HirFrame::Expr(self.hir_repetition(x, expr))); |
418 | } |
419 | Ast::Group(ref x) => { |
420 | let expr = self.pop().unwrap().unwrap_expr(); |
421 | let old_flags = self.pop().unwrap().unwrap_group(); |
422 | self.trans().flags.set(old_flags); |
423 | self.push(HirFrame::Expr(self.hir_capture(x, expr))); |
424 | } |
425 | Ast::Concat(_) => { |
426 | let mut exprs = vec![]; |
427 | while let Some(expr) = self.pop_concat_expr() { |
428 | if !matches!(*expr.kind(), HirKind::Empty) { |
429 | exprs.push(expr); |
430 | } |
431 | } |
432 | exprs.reverse(); |
433 | self.push(HirFrame::Expr(Hir::concat(exprs))); |
434 | } |
435 | Ast::Alternation(_) => { |
436 | let mut exprs = vec![]; |
437 | while let Some(expr) = self.pop_alt_expr() { |
438 | self.pop().unwrap().unwrap_alternation_pipe(); |
439 | exprs.push(expr); |
440 | } |
441 | exprs.reverse(); |
442 | self.push(HirFrame::Expr(Hir::alternation(exprs))); |
443 | } |
444 | } |
445 | Ok(()) |
446 | } |
447 | |
448 | fn visit_alternation_in(&mut self) -> Result<()> { |
449 | self.push(HirFrame::AlternationBranch); |
450 | Ok(()) |
451 | } |
452 | |
453 | fn visit_class_set_item_pre( |
454 | &mut self, |
455 | ast: &ast::ClassSetItem, |
456 | ) -> Result<()> { |
457 | match *ast { |
458 | ast::ClassSetItem::Bracketed(_) => { |
459 | if self.flags().unicode() { |
460 | let cls = hir::ClassUnicode::empty(); |
461 | self.push(HirFrame::ClassUnicode(cls)); |
462 | } else { |
463 | let cls = hir::ClassBytes::empty(); |
464 | self.push(HirFrame::ClassBytes(cls)); |
465 | } |
466 | } |
467 | // We needn't handle the Union case here since the visitor will |
468 | // do it for us. |
469 | _ => {} |
470 | } |
471 | Ok(()) |
472 | } |
473 | |
474 | fn visit_class_set_item_post( |
475 | &mut self, |
476 | ast: &ast::ClassSetItem, |
477 | ) -> Result<()> { |
478 | match *ast { |
479 | ast::ClassSetItem::Empty(_) => {} |
480 | ast::ClassSetItem::Literal(ref x) => { |
481 | if self.flags().unicode() { |
482 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
483 | cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); |
484 | self.push(HirFrame::ClassUnicode(cls)); |
485 | } else { |
486 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
487 | let byte = self.class_literal_byte(x)?; |
488 | cls.push(hir::ClassBytesRange::new(byte, byte)); |
489 | self.push(HirFrame::ClassBytes(cls)); |
490 | } |
491 | } |
492 | ast::ClassSetItem::Range(ref x) => { |
493 | if self.flags().unicode() { |
494 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
495 | cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); |
496 | self.push(HirFrame::ClassUnicode(cls)); |
497 | } else { |
498 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
499 | let start = self.class_literal_byte(&x.start)?; |
500 | let end = self.class_literal_byte(&x.end)?; |
501 | cls.push(hir::ClassBytesRange::new(start, end)); |
502 | self.push(HirFrame::ClassBytes(cls)); |
503 | } |
504 | } |
505 | ast::ClassSetItem::Ascii(ref x) => { |
506 | if self.flags().unicode() { |
507 | let xcls = self.hir_ascii_unicode_class(x)?; |
508 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
509 | cls.union(&xcls); |
510 | self.push(HirFrame::ClassUnicode(cls)); |
511 | } else { |
512 | let xcls = self.hir_ascii_byte_class(x)?; |
513 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
514 | cls.union(&xcls); |
515 | self.push(HirFrame::ClassBytes(cls)); |
516 | } |
517 | } |
518 | ast::ClassSetItem::Unicode(ref x) => { |
519 | let xcls = self.hir_unicode_class(x)?; |
520 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
521 | cls.union(&xcls); |
522 | self.push(HirFrame::ClassUnicode(cls)); |
523 | } |
524 | ast::ClassSetItem::Perl(ref x) => { |
525 | if self.flags().unicode() { |
526 | let xcls = self.hir_perl_unicode_class(x)?; |
527 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
528 | cls.union(&xcls); |
529 | self.push(HirFrame::ClassUnicode(cls)); |
530 | } else { |
531 | let xcls = self.hir_perl_byte_class(x)?; |
532 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
533 | cls.union(&xcls); |
534 | self.push(HirFrame::ClassBytes(cls)); |
535 | } |
536 | } |
537 | ast::ClassSetItem::Bracketed(ref ast) => { |
538 | if self.flags().unicode() { |
539 | let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); |
540 | self.unicode_fold_and_negate( |
541 | &ast.span, |
542 | ast.negated, |
543 | &mut cls1, |
544 | )?; |
545 | |
546 | let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); |
547 | cls2.union(&cls1); |
548 | self.push(HirFrame::ClassUnicode(cls2)); |
549 | } else { |
550 | let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); |
551 | self.bytes_fold_and_negate( |
552 | &ast.span, |
553 | ast.negated, |
554 | &mut cls1, |
555 | )?; |
556 | |
557 | let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); |
558 | cls2.union(&cls1); |
559 | self.push(HirFrame::ClassBytes(cls2)); |
560 | } |
561 | } |
562 | // This is handled automatically by the visitor. |
563 | ast::ClassSetItem::Union(_) => {} |
564 | } |
565 | Ok(()) |
566 | } |
567 | |
568 | fn visit_class_set_binary_op_pre( |
569 | &mut self, |
570 | _op: &ast::ClassSetBinaryOp, |
571 | ) -> Result<()> { |
572 | if self.flags().unicode() { |
573 | let cls = hir::ClassUnicode::empty(); |
574 | self.push(HirFrame::ClassUnicode(cls)); |
575 | } else { |
576 | let cls = hir::ClassBytes::empty(); |
577 | self.push(HirFrame::ClassBytes(cls)); |
578 | } |
579 | Ok(()) |
580 | } |
581 | |
582 | fn visit_class_set_binary_op_in( |
583 | &mut self, |
584 | _op: &ast::ClassSetBinaryOp, |
585 | ) -> Result<()> { |
586 | if self.flags().unicode() { |
587 | let cls = hir::ClassUnicode::empty(); |
588 | self.push(HirFrame::ClassUnicode(cls)); |
589 | } else { |
590 | let cls = hir::ClassBytes::empty(); |
591 | self.push(HirFrame::ClassBytes(cls)); |
592 | } |
593 | Ok(()) |
594 | } |
595 | |
596 | fn visit_class_set_binary_op_post( |
597 | &mut self, |
598 | op: &ast::ClassSetBinaryOp, |
599 | ) -> Result<()> { |
600 | use crate::ast::ClassSetBinaryOpKind::*; |
601 | |
602 | if self.flags().unicode() { |
603 | let mut rhs = self.pop().unwrap().unwrap_class_unicode(); |
604 | let mut lhs = self.pop().unwrap().unwrap_class_unicode(); |
605 | let mut cls = self.pop().unwrap().unwrap_class_unicode(); |
606 | if self.flags().case_insensitive() { |
607 | rhs.try_case_fold_simple().map_err(|_| { |
608 | self.error( |
609 | op.rhs.span().clone(), |
610 | ErrorKind::UnicodeCaseUnavailable, |
611 | ) |
612 | })?; |
613 | lhs.try_case_fold_simple().map_err(|_| { |
614 | self.error( |
615 | op.lhs.span().clone(), |
616 | ErrorKind::UnicodeCaseUnavailable, |
617 | ) |
618 | })?; |
619 | } |
620 | match op.kind { |
621 | Intersection => lhs.intersect(&rhs), |
622 | Difference => lhs.difference(&rhs), |
623 | SymmetricDifference => lhs.symmetric_difference(&rhs), |
624 | } |
625 | cls.union(&lhs); |
626 | self.push(HirFrame::ClassUnicode(cls)); |
627 | } else { |
628 | let mut rhs = self.pop().unwrap().unwrap_class_bytes(); |
629 | let mut lhs = self.pop().unwrap().unwrap_class_bytes(); |
630 | let mut cls = self.pop().unwrap().unwrap_class_bytes(); |
631 | if self.flags().case_insensitive() { |
632 | rhs.case_fold_simple(); |
633 | lhs.case_fold_simple(); |
634 | } |
635 | match op.kind { |
636 | Intersection => lhs.intersect(&rhs), |
637 | Difference => lhs.difference(&rhs), |
638 | SymmetricDifference => lhs.symmetric_difference(&rhs), |
639 | } |
640 | cls.union(&lhs); |
641 | self.push(HirFrame::ClassBytes(cls)); |
642 | } |
643 | Ok(()) |
644 | } |
645 | } |
646 | |
647 | /// The internal implementation of a translator. |
648 | /// |
649 | /// This type is responsible for carrying around the original pattern string, |
650 | /// which is not tied to the internal state of a translator. |
651 | /// |
652 | /// A TranslatorI exists for the time it takes to translate a single Ast. |
653 | #[derive (Clone, Debug)] |
654 | struct TranslatorI<'t, 'p> { |
655 | trans: &'t Translator, |
656 | pattern: &'p str, |
657 | } |
658 | |
659 | impl<'t, 'p> TranslatorI<'t, 'p> { |
660 | /// Build a new internal translator. |
661 | fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { |
662 | TranslatorI { trans, pattern } |
663 | } |
664 | |
665 | /// Return a reference to the underlying translator. |
666 | fn trans(&self) -> &Translator { |
667 | &self.trans |
668 | } |
669 | |
670 | /// Push the given frame on to the call stack. |
671 | fn push(&self, frame: HirFrame) { |
672 | self.trans().stack.borrow_mut().push(frame); |
673 | } |
674 | |
675 | /// Push the given literal char on to the call stack. |
676 | /// |
677 | /// If the top-most element of the stack is a literal, then the char |
678 | /// is appended to the end of that literal. Otherwise, a new literal |
679 | /// containing just the given char is pushed to the top of the stack. |
680 | fn push_char(&self, ch: char) { |
681 | let mut buf = [0; 4]; |
682 | let bytes = ch.encode_utf8(&mut buf).as_bytes(); |
683 | let mut stack = self.trans().stack.borrow_mut(); |
684 | if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { |
685 | literal.extend_from_slice(bytes); |
686 | } else { |
687 | stack.push(HirFrame::Literal(bytes.to_vec())); |
688 | } |
689 | } |
690 | |
691 | /// Push the given literal byte on to the call stack. |
692 | /// |
693 | /// If the top-most element of the stack is a literal, then the byte |
694 | /// is appended to the end of that literal. Otherwise, a new literal |
695 | /// containing just the given byte is pushed to the top of the stack. |
696 | fn push_byte(&self, byte: u8) { |
697 | let mut stack = self.trans().stack.borrow_mut(); |
698 | if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { |
699 | literal.push(byte); |
700 | } else { |
701 | stack.push(HirFrame::Literal(vec![byte])); |
702 | } |
703 | } |
704 | |
705 | /// Pop the top of the call stack. If the call stack is empty, return None. |
706 | fn pop(&self) -> Option<HirFrame> { |
707 | self.trans().stack.borrow_mut().pop() |
708 | } |
709 | |
710 | /// Pop an HIR expression from the top of the stack for a concatenation. |
711 | /// |
712 | /// This returns None if the stack is empty or when a concat frame is seen. |
713 | /// Otherwise, it panics if it could not find an HIR expression. |
714 | fn pop_concat_expr(&self) -> Option<Hir> { |
715 | let frame = self.pop()?; |
716 | match frame { |
717 | HirFrame::Concat => None, |
718 | HirFrame::Expr(expr) => Some(expr), |
719 | HirFrame::Literal(lit) => Some(Hir::literal(lit)), |
720 | HirFrame::ClassUnicode(_) => { |
721 | unreachable!("expected expr or concat, got Unicode class" ) |
722 | } |
723 | HirFrame::ClassBytes(_) => { |
724 | unreachable!("expected expr or concat, got byte class" ) |
725 | } |
726 | HirFrame::Repetition => { |
727 | unreachable!("expected expr or concat, got repetition" ) |
728 | } |
729 | HirFrame::Group { .. } => { |
730 | unreachable!("expected expr or concat, got group" ) |
731 | } |
732 | HirFrame::Alternation => { |
733 | unreachable!("expected expr or concat, got alt marker" ) |
734 | } |
735 | HirFrame::AlternationBranch => { |
736 | unreachable!("expected expr or concat, got alt branch marker" ) |
737 | } |
738 | } |
739 | } |
740 | |
741 | /// Pop an HIR expression from the top of the stack for an alternation. |
742 | /// |
743 | /// This returns None if the stack is empty or when an alternation frame is |
744 | /// seen. Otherwise, it panics if it could not find an HIR expression. |
745 | fn pop_alt_expr(&self) -> Option<Hir> { |
746 | let frame = self.pop()?; |
747 | match frame { |
748 | HirFrame::Alternation => None, |
749 | HirFrame::Expr(expr) => Some(expr), |
750 | HirFrame::Literal(lit) => Some(Hir::literal(lit)), |
751 | HirFrame::ClassUnicode(_) => { |
752 | unreachable!("expected expr or alt, got Unicode class" ) |
753 | } |
754 | HirFrame::ClassBytes(_) => { |
755 | unreachable!("expected expr or alt, got byte class" ) |
756 | } |
757 | HirFrame::Repetition => { |
758 | unreachable!("expected expr or alt, got repetition" ) |
759 | } |
760 | HirFrame::Group { .. } => { |
761 | unreachable!("expected expr or alt, got group" ) |
762 | } |
763 | HirFrame::Concat => { |
764 | unreachable!("expected expr or alt, got concat marker" ) |
765 | } |
766 | HirFrame::AlternationBranch => { |
767 | unreachable!("expected expr or alt, got alt branch marker" ) |
768 | } |
769 | } |
770 | } |
771 | |
772 | /// Create a new error with the given span and error type. |
773 | fn error(&self, span: Span, kind: ErrorKind) -> Error { |
774 | Error { kind, pattern: self.pattern.to_string(), span } |
775 | } |
776 | |
777 | /// Return a copy of the active flags. |
778 | fn flags(&self) -> Flags { |
779 | self.trans().flags.get() |
780 | } |
781 | |
782 | /// Set the flags of this translator from the flags set in the given AST. |
783 | /// Then, return the old flags. |
784 | fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { |
785 | let old_flags = self.flags(); |
786 | let mut new_flags = Flags::from_ast(ast_flags); |
787 | new_flags.merge(&old_flags); |
788 | self.trans().flags.set(new_flags); |
789 | old_flags |
790 | } |
791 | |
792 | /// Convert an Ast literal to its scalar representation. |
793 | /// |
794 | /// When Unicode mode is enabled, then this always succeeds and returns a |
795 | /// `char` (Unicode scalar value). |
796 | /// |
797 | /// When Unicode mode is disabled, then a `char` will still be returned |
798 | /// whenever possible. A byte is returned only when invalid UTF-8 is |
799 | /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte |
800 | /// will result in an error when invalid UTF-8 is not allowed. |
801 | fn ast_literal_to_scalar( |
802 | &self, |
803 | lit: &ast::Literal, |
804 | ) -> Result<Either<char, u8>> { |
805 | if self.flags().unicode() { |
806 | return Ok(Either::Left(lit.c)); |
807 | } |
808 | let byte = match lit.byte() { |
809 | None => return Ok(Either::Left(lit.c)), |
810 | Some(byte) => byte, |
811 | }; |
812 | if byte <= 0x7F { |
813 | return Ok(Either::Left(char::try_from(byte).unwrap())); |
814 | } |
815 | if self.trans().utf8 { |
816 | return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); |
817 | } |
818 | Ok(Either::Right(byte)) |
819 | } |
820 | |
821 | fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> { |
822 | if !self.flags().case_insensitive() { |
823 | return Ok(None); |
824 | } |
825 | if self.flags().unicode() { |
826 | // If case folding won't do anything, then don't bother trying. |
827 | let map = unicode::SimpleCaseFolder::new() |
828 | .map(|f| f.overlaps(c, c)) |
829 | .map_err(|_| { |
830 | self.error(span, ErrorKind::UnicodeCaseUnavailable) |
831 | })?; |
832 | if !map { |
833 | return Ok(None); |
834 | } |
835 | let mut cls = |
836 | hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( |
837 | c, c, |
838 | )]); |
839 | cls.try_case_fold_simple().map_err(|_| { |
840 | self.error(span, ErrorKind::UnicodeCaseUnavailable) |
841 | })?; |
842 | Ok(Some(Hir::class(hir::Class::Unicode(cls)))) |
843 | } else { |
844 | if c.len_utf8() > 1 { |
845 | return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); |
846 | } |
847 | // If case folding won't do anything, then don't bother trying. |
848 | match c { |
849 | 'A' ..='Z' | 'a' ..='z' => {} |
850 | _ => return Ok(None), |
851 | } |
852 | let mut cls = |
853 | hir::ClassBytes::new(vec![hir::ClassBytesRange::new( |
854 | // OK because 'c.len_utf8() == 1' which in turn implies |
855 | // that 'c' is ASCII. |
856 | u8::try_from(c).unwrap(), |
857 | u8::try_from(c).unwrap(), |
858 | )]); |
859 | cls.case_fold_simple(); |
860 | Ok(Some(Hir::class(hir::Class::Bytes(cls)))) |
861 | } |
862 | } |
863 | |
864 | fn hir_dot(&self, span: Span) -> Result<Hir> { |
865 | if !self.flags().unicode() && self.trans().utf8 { |
866 | return Err(self.error(span, ErrorKind::InvalidUtf8)); |
867 | } |
868 | Ok(Hir::dot(self.flags().dot())) |
869 | } |
870 | |
871 | fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { |
872 | let unicode = self.flags().unicode(); |
873 | let multi_line = self.flags().multi_line(); |
874 | let crlf = self.flags().crlf(); |
875 | Ok(match asst.kind { |
876 | ast::AssertionKind::StartLine => Hir::look(if multi_line { |
877 | if crlf { |
878 | hir::Look::StartCRLF |
879 | } else { |
880 | hir::Look::StartLF |
881 | } |
882 | } else { |
883 | hir::Look::Start |
884 | }), |
885 | ast::AssertionKind::EndLine => Hir::look(if multi_line { |
886 | if crlf { |
887 | hir::Look::EndCRLF |
888 | } else { |
889 | hir::Look::EndLF |
890 | } |
891 | } else { |
892 | hir::Look::End |
893 | }), |
894 | ast::AssertionKind::StartText => Hir::look(hir::Look::Start), |
895 | ast::AssertionKind::EndText => Hir::look(hir::Look::End), |
896 | ast::AssertionKind::WordBoundary => Hir::look(if unicode { |
897 | hir::Look::WordUnicode |
898 | } else { |
899 | hir::Look::WordAscii |
900 | }), |
901 | ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { |
902 | hir::Look::WordUnicodeNegate |
903 | } else { |
904 | hir::Look::WordAsciiNegate |
905 | }), |
906 | }) |
907 | } |
908 | |
909 | fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { |
910 | let (index, name) = match group.kind { |
911 | ast::GroupKind::CaptureIndex(index) => (index, None), |
912 | ast::GroupKind::CaptureName { ref name, .. } => { |
913 | (name.index, Some(name.name.clone().into_boxed_str())) |
914 | } |
915 | // The HIR doesn't need to use non-capturing groups, since the way |
916 | // in which the data type is defined handles this automatically. |
917 | ast::GroupKind::NonCapturing(_) => return expr, |
918 | }; |
919 | Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) |
920 | } |
921 | |
922 | fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { |
923 | let (min, max) = match rep.op.kind { |
924 | ast::RepetitionKind::ZeroOrOne => (0, Some(1)), |
925 | ast::RepetitionKind::ZeroOrMore => (0, None), |
926 | ast::RepetitionKind::OneOrMore => (1, None), |
927 | ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { |
928 | (m, Some(m)) |
929 | } |
930 | ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { |
931 | (m, None) |
932 | } |
933 | ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( |
934 | m, |
935 | n, |
936 | )) => (m, Some(n)), |
937 | }; |
938 | let greedy = |
939 | if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; |
940 | Hir::repetition(hir::Repetition { |
941 | min, |
942 | max, |
943 | greedy, |
944 | sub: Box::new(expr), |
945 | }) |
946 | } |
947 | |
948 | fn hir_unicode_class( |
949 | &self, |
950 | ast_class: &ast::ClassUnicode, |
951 | ) -> Result<hir::ClassUnicode> { |
952 | use crate::ast::ClassUnicodeKind::*; |
953 | |
954 | if !self.flags().unicode() { |
955 | return Err( |
956 | self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) |
957 | ); |
958 | } |
959 | let query = match ast_class.kind { |
960 | OneLetter(name) => ClassQuery::OneLetter(name), |
961 | Named(ref name) => ClassQuery::Binary(name), |
962 | NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { |
963 | property_name: name, |
964 | property_value: value, |
965 | }, |
966 | }; |
967 | let mut result = self.convert_unicode_class_error( |
968 | &ast_class.span, |
969 | unicode::class(query), |
970 | ); |
971 | if let Ok(ref mut class) = result { |
972 | self.unicode_fold_and_negate( |
973 | &ast_class.span, |
974 | ast_class.negated, |
975 | class, |
976 | )?; |
977 | } |
978 | result |
979 | } |
980 | |
981 | fn hir_ascii_unicode_class( |
982 | &self, |
983 | ast: &ast::ClassAscii, |
984 | ) -> Result<hir::ClassUnicode> { |
985 | let mut cls = hir::ClassUnicode::new( |
986 | ascii_class_as_chars(&ast.kind) |
987 | .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), |
988 | ); |
989 | self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; |
990 | Ok(cls) |
991 | } |
992 | |
993 | fn hir_ascii_byte_class( |
994 | &self, |
995 | ast: &ast::ClassAscii, |
996 | ) -> Result<hir::ClassBytes> { |
997 | let mut cls = hir::ClassBytes::new( |
998 | ascii_class(&ast.kind) |
999 | .map(|(s, e)| hir::ClassBytesRange::new(s, e)), |
1000 | ); |
1001 | self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; |
1002 | Ok(cls) |
1003 | } |
1004 | |
1005 | fn hir_perl_unicode_class( |
1006 | &self, |
1007 | ast_class: &ast::ClassPerl, |
1008 | ) -> Result<hir::ClassUnicode> { |
1009 | use crate::ast::ClassPerlKind::*; |
1010 | |
1011 | assert!(self.flags().unicode()); |
1012 | let result = match ast_class.kind { |
1013 | Digit => unicode::perl_digit(), |
1014 | Space => unicode::perl_space(), |
1015 | Word => unicode::perl_word(), |
1016 | }; |
1017 | let mut class = |
1018 | self.convert_unicode_class_error(&ast_class.span, result)?; |
1019 | // We needn't apply case folding here because the Perl Unicode classes |
1020 | // are already closed under Unicode simple case folding. |
1021 | if ast_class.negated { |
1022 | class.negate(); |
1023 | } |
1024 | Ok(class) |
1025 | } |
1026 | |
1027 | fn hir_perl_byte_class( |
1028 | &self, |
1029 | ast_class: &ast::ClassPerl, |
1030 | ) -> Result<hir::ClassBytes> { |
1031 | use crate::ast::ClassPerlKind::*; |
1032 | |
1033 | assert!(!self.flags().unicode()); |
1034 | let mut class = match ast_class.kind { |
1035 | Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), |
1036 | Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), |
1037 | Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), |
1038 | }; |
1039 | // We needn't apply case folding here because the Perl ASCII classes |
1040 | // are already closed (under ASCII case folding). |
1041 | if ast_class.negated { |
1042 | class.negate(); |
1043 | } |
1044 | // Negating a Perl byte class is likely to cause it to match invalid |
1045 | // UTF-8. That's only OK if the translator is configured to allow such |
1046 | // things. |
1047 | if self.trans().utf8 && !class.is_ascii() { |
1048 | return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); |
1049 | } |
1050 | Ok(class) |
1051 | } |
1052 | |
1053 | /// Converts the given Unicode specific error to an HIR translation error. |
1054 | /// |
1055 | /// The span given should approximate the position at which an error would |
1056 | /// occur. |
1057 | fn convert_unicode_class_error( |
1058 | &self, |
1059 | span: &Span, |
1060 | result: core::result::Result<hir::ClassUnicode, unicode::Error>, |
1061 | ) -> Result<hir::ClassUnicode> { |
1062 | result.map_err(|err| { |
1063 | let sp = span.clone(); |
1064 | match err { |
1065 | unicode::Error::PropertyNotFound => { |
1066 | self.error(sp, ErrorKind::UnicodePropertyNotFound) |
1067 | } |
1068 | unicode::Error::PropertyValueNotFound => { |
1069 | self.error(sp, ErrorKind::UnicodePropertyValueNotFound) |
1070 | } |
1071 | unicode::Error::PerlClassNotFound => { |
1072 | self.error(sp, ErrorKind::UnicodePerlClassNotFound) |
1073 | } |
1074 | } |
1075 | }) |
1076 | } |
1077 | |
1078 | fn unicode_fold_and_negate( |
1079 | &self, |
1080 | span: &Span, |
1081 | negated: bool, |
1082 | class: &mut hir::ClassUnicode, |
1083 | ) -> Result<()> { |
1084 | // Note that we must apply case folding before negation! |
1085 | // Consider `(?i)[^x]`. If we applied negation first, then |
1086 | // the result would be the character class that matched any |
1087 | // Unicode scalar value. |
1088 | if self.flags().case_insensitive() { |
1089 | class.try_case_fold_simple().map_err(|_| { |
1090 | self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) |
1091 | })?; |
1092 | } |
1093 | if negated { |
1094 | class.negate(); |
1095 | } |
1096 | Ok(()) |
1097 | } |
1098 | |
1099 | fn bytes_fold_and_negate( |
1100 | &self, |
1101 | span: &Span, |
1102 | negated: bool, |
1103 | class: &mut hir::ClassBytes, |
1104 | ) -> Result<()> { |
1105 | // Note that we must apply case folding before negation! |
1106 | // Consider `(?i)[^x]`. If we applied negation first, then |
1107 | // the result would be the character class that matched any |
1108 | // Unicode scalar value. |
1109 | if self.flags().case_insensitive() { |
1110 | class.case_fold_simple(); |
1111 | } |
1112 | if negated { |
1113 | class.negate(); |
1114 | } |
1115 | if self.trans().utf8 && !class.is_ascii() { |
1116 | return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); |
1117 | } |
1118 | Ok(()) |
1119 | } |
1120 | |
1121 | /// Return a scalar byte value suitable for use as a literal in a byte |
1122 | /// character class. |
1123 | fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { |
1124 | match self.ast_literal_to_scalar(ast)? { |
1125 | Either::Right(byte) => Ok(byte), |
1126 | Either::Left(ch) => { |
1127 | let cp = u32::from(ch); |
1128 | if cp <= 0x7F { |
1129 | Ok(u8::try_from(cp).unwrap()) |
1130 | } else { |
1131 | // We can't feasibly support Unicode in |
1132 | // byte oriented classes. Byte classes don't |
1133 | // do Unicode case folding. |
1134 | Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) |
1135 | } |
1136 | } |
1137 | } |
1138 | } |
1139 | } |
1140 | |
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent, present but disabled or
/// present but enabled. These are encoded as `None`, `Some(false)` and
/// `Some(true)` respectively. An absent flag falls back to a default when it
/// is read through its accessor method: `true` for `unicode` and `false` for
/// every other flag.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    /// State of the case insensitive flag.
    case_insensitive: Option<bool>,
    /// State of the multi-line flag, which affects how line anchors are
    /// translated.
    multi_line: Option<bool>,
    /// State of the "dot matches new line" flag.
    dot_matches_new_line: Option<bool>,
    /// State of the swap-greed flag, which inverts repetition greediness.
    swap_greed: Option<bool>,
    /// State of the Unicode flag. This is the only flag that is treated as
    /// enabled when absent.
    unicode: Option<bool>,
    /// State of the CRLF flag, which affects line anchors and `.`.
    crlf: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1157 | |
1158 | impl Flags { |
1159 | fn from_ast(ast: &ast::Flags) -> Flags { |
1160 | let mut flags = Flags::default(); |
1161 | let mut enable = true; |
1162 | for item in &ast.items { |
1163 | match item.kind { |
1164 | ast::FlagsItemKind::Negation => { |
1165 | enable = false; |
1166 | } |
1167 | ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { |
1168 | flags.case_insensitive = Some(enable); |
1169 | } |
1170 | ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { |
1171 | flags.multi_line = Some(enable); |
1172 | } |
1173 | ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { |
1174 | flags.dot_matches_new_line = Some(enable); |
1175 | } |
1176 | ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { |
1177 | flags.swap_greed = Some(enable); |
1178 | } |
1179 | ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { |
1180 | flags.unicode = Some(enable); |
1181 | } |
1182 | ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { |
1183 | flags.crlf = Some(enable); |
1184 | } |
1185 | ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} |
1186 | } |
1187 | } |
1188 | flags |
1189 | } |
1190 | |
1191 | fn merge(&mut self, previous: &Flags) { |
1192 | if self.case_insensitive.is_none() { |
1193 | self.case_insensitive = previous.case_insensitive; |
1194 | } |
1195 | if self.multi_line.is_none() { |
1196 | self.multi_line = previous.multi_line; |
1197 | } |
1198 | if self.dot_matches_new_line.is_none() { |
1199 | self.dot_matches_new_line = previous.dot_matches_new_line; |
1200 | } |
1201 | if self.swap_greed.is_none() { |
1202 | self.swap_greed = previous.swap_greed; |
1203 | } |
1204 | if self.unicode.is_none() { |
1205 | self.unicode = previous.unicode; |
1206 | } |
1207 | if self.crlf.is_none() { |
1208 | self.crlf = previous.crlf; |
1209 | } |
1210 | } |
1211 | |
1212 | fn dot(&self) -> hir::Dot { |
1213 | if self.dot_matches_new_line() { |
1214 | if self.unicode() { |
1215 | hir::Dot::AnyChar |
1216 | } else { |
1217 | hir::Dot::AnyByte |
1218 | } |
1219 | } else { |
1220 | if self.unicode() { |
1221 | if self.crlf() { |
1222 | hir::Dot::AnyCharExceptCRLF |
1223 | } else { |
1224 | hir::Dot::AnyCharExceptLF |
1225 | } |
1226 | } else { |
1227 | if self.crlf() { |
1228 | hir::Dot::AnyByteExceptCRLF |
1229 | } else { |
1230 | hir::Dot::AnyByteExceptLF |
1231 | } |
1232 | } |
1233 | } |
1234 | } |
1235 | |
1236 | fn case_insensitive(&self) -> bool { |
1237 | self.case_insensitive.unwrap_or(false) |
1238 | } |
1239 | |
1240 | fn multi_line(&self) -> bool { |
1241 | self.multi_line.unwrap_or(false) |
1242 | } |
1243 | |
1244 | fn dot_matches_new_line(&self) -> bool { |
1245 | self.dot_matches_new_line.unwrap_or(false) |
1246 | } |
1247 | |
1248 | fn swap_greed(&self) -> bool { |
1249 | self.swap_greed.unwrap_or(false) |
1250 | } |
1251 | |
1252 | fn unicode(&self) -> bool { |
1253 | self.unicode.unwrap_or(true) |
1254 | } |
1255 | |
1256 | fn crlf(&self) -> bool { |
1257 | self.crlf.unwrap_or(false) |
1258 | } |
1259 | } |
1260 | |
1261 | fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { |
1262 | let ranges: Vec<_> = ascii_classimpl Iterator (kind) |
1263 | .map(|(s: u8, e: u8)| hir::ClassBytesRange::new(start:s, end:e)) |
1264 | .collect(); |
1265 | hir::ClassBytes::new(ranges) |
1266 | } |
1267 | |
1268 | fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> { |
1269 | use crate::ast::ClassAsciiKind::*; |
1270 | |
1271 | let slice: &'static [(u8, u8)] = match *kind { |
1272 | Alnum => &[(b'0' , b'9' ), (b'A' , b'Z' ), (b'a' , b'z' )], |
1273 | Alpha => &[(b'A' , b'Z' ), (b'a' , b'z' )], |
1274 | Ascii => &[(b' \x00' , b' \x7F' )], |
1275 | Blank => &[(b' \t' , b' \t' ), (b' ' , b' ' )], |
1276 | Cntrl => &[(b' \x00' , b' \x1F' ), (b' \x7F' , b' \x7F' )], |
1277 | Digit => &[(b'0' , b'9' )], |
1278 | Graph => &[(b'!' , b'~' )], |
1279 | Lower => &[(b'a' , b'z' )], |
1280 | Print => &[(b' ' , b'~' )], |
1281 | Punct => &[(b'!' , b'/' ), (b':' , b'@' ), (b'[' , b'`' ), (b'{' , b'~' )], |
1282 | Space => &[ |
1283 | (b' \t' , b' \t' ), |
1284 | (b' \n' , b' \n' ), |
1285 | (b' \x0B' , b' \x0B' ), |
1286 | (b' \x0C' , b' \x0C' ), |
1287 | (b' \r' , b' \r' ), |
1288 | (b' ' , b' ' ), |
1289 | ], |
1290 | Upper => &[(b'A' , b'Z' )], |
1291 | Word => &[(b'0' , b'9' ), (b'A' , b'Z' ), (b'_' , b'_' ), (b'a' , b'z' )], |
1292 | Xdigit => &[(b'0' , b'9' ), (b'A' , b'F' ), (b'a' , b'f' )], |
1293 | }; |
1294 | slice.iter().copied() |
1295 | } |
1296 | |
1297 | fn ascii_class_as_chars( |
1298 | kind: &ast::ClassAsciiKind, |
1299 | ) -> impl Iterator<Item = (char, char)> { |
1300 | ascii_class(kind).map(|(s: u8, e: u8)| (char::from(s), char::from(e))) |
1301 | } |
1302 | |
1303 | #[cfg (test)] |
1304 | mod tests { |
1305 | use crate::{ |
1306 | ast::{self, parse::ParserBuilder, Ast, Position, Span}, |
1307 | hir::{self, Hir, HirKind, Look, Properties}, |
1308 | unicode::{self, ClassQuery}, |
1309 | }; |
1310 | |
1311 | use super::*; |
1312 | |
// We create these errors to compare with real hir::Errors in the tests.
// We define equality between TestError and hir::Error to disregard the
// pattern string in hir::Error, which is annoying to provide in tests.
#[derive(Clone, Debug)]
struct TestError {
    // The span at which the translation error is expected to be reported.
    span: Span,
    // The kind of translation error that is expected.
    kind: hir::ErrorKind,
}
1321 | |
1322 | impl PartialEq<hir::Error> for TestError { |
1323 | fn eq(&self, other: &hir::Error) -> bool { |
1324 | self.span == other.span && self.kind == other.kind |
1325 | } |
1326 | } |
1327 | |
1328 | impl PartialEq<TestError> for hir::Error { |
1329 | fn eq(&self, other: &TestError) -> bool { |
1330 | self.span == other.span && self.kind == other.kind |
1331 | } |
1332 | } |
1333 | |
1334 | fn parse(pattern: &str) -> Ast { |
1335 | ParserBuilder::new().octal(true).build().parse(pattern).unwrap() |
1336 | } |
1337 | |
1338 | fn t(pattern: &str) -> Hir { |
1339 | TranslatorBuilder::new() |
1340 | .utf8(true) |
1341 | .build() |
1342 | .translate(pattern, &parse(pattern)) |
1343 | .unwrap() |
1344 | } |
1345 | |
1346 | fn t_err(pattern: &str) -> hir::Error { |
1347 | TranslatorBuilder::new() |
1348 | .utf8(true) |
1349 | .build() |
1350 | .translate(pattern, &parse(pattern)) |
1351 | .unwrap_err() |
1352 | } |
1353 | |
1354 | fn t_bytes(pattern: &str) -> Hir { |
1355 | TranslatorBuilder::new() |
1356 | .utf8(false) |
1357 | .build() |
1358 | .translate(pattern, &parse(pattern)) |
1359 | .unwrap() |
1360 | } |
1361 | |
1362 | fn props(pattern: &str) -> Properties { |
1363 | t(pattern).properties().clone() |
1364 | } |
1365 | |
1366 | fn props_bytes(pattern: &str) -> Properties { |
1367 | t_bytes(pattern).properties().clone() |
1368 | } |
1369 | |
1370 | fn hir_lit(s: &str) -> Hir { |
1371 | hir_blit(s.as_bytes()) |
1372 | } |
1373 | |
/// Builds a literal HIR from raw bytes, which need not be valid UTF-8.
fn hir_blit(s: &[u8]) -> Hir {
    Hir::literal(s)
}
1377 | |
1378 | fn hir_capture(index: u32, expr: Hir) -> Hir { |
1379 | Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) |
1380 | } |
1381 | |
1382 | fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { |
1383 | Hir::capture(hir::Capture { |
1384 | index, |
1385 | name: Some(name.into()), |
1386 | sub: Box::new(expr), |
1387 | }) |
1388 | } |
1389 | |
1390 | fn hir_quest(greedy: bool, expr: Hir) -> Hir { |
1391 | Hir::repetition(hir::Repetition { |
1392 | min: 0, |
1393 | max: Some(1), |
1394 | greedy, |
1395 | sub: Box::new(expr), |
1396 | }) |
1397 | } |
1398 | |
1399 | fn hir_star(greedy: bool, expr: Hir) -> Hir { |
1400 | Hir::repetition(hir::Repetition { |
1401 | min: 0, |
1402 | max: None, |
1403 | greedy, |
1404 | sub: Box::new(expr), |
1405 | }) |
1406 | } |
1407 | |
1408 | fn hir_plus(greedy: bool, expr: Hir) -> Hir { |
1409 | Hir::repetition(hir::Repetition { |
1410 | min: 1, |
1411 | max: None, |
1412 | greedy, |
1413 | sub: Box::new(expr), |
1414 | }) |
1415 | } |
1416 | |
1417 | fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir { |
1418 | Hir::repetition(hir::Repetition { |
1419 | min, |
1420 | max, |
1421 | greedy, |
1422 | sub: Box::new(expr), |
1423 | }) |
1424 | } |
1425 | |
/// Builds an alternation of the given expressions via `Hir::alternation`.
fn hir_alt(alts: Vec<Hir>) -> Hir {
    Hir::alternation(alts)
}
1429 | |
/// Builds a concatenation of the given expressions via `Hir::concat`.
fn hir_cat(exprs: Vec<Hir>) -> Hir {
    Hir::concat(exprs)
}
1433 | |
1434 | #[allow (dead_code)] |
1435 | fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { |
1436 | Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) |
1437 | } |
1438 | |
1439 | #[allow (dead_code)] |
1440 | fn hir_uclass_perl_word() -> Hir { |
1441 | Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) |
1442 | } |
1443 | |
1444 | fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { |
1445 | Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( |
1446 | ascii_class_as_chars(kind) |
1447 | .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), |
1448 | ))) |
1449 | } |
1450 | |
1451 | fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { |
1452 | Hir::class(hir::Class::Bytes(hir::ClassBytes::new( |
1453 | ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), |
1454 | ))) |
1455 | } |
1456 | |
1457 | fn hir_uclass(ranges: &[(char, char)]) -> Hir { |
1458 | Hir::class(uclass(ranges)) |
1459 | } |
1460 | |
1461 | fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { |
1462 | Hir::class(bclass(ranges)) |
1463 | } |
1464 | |
1465 | fn hir_case_fold(expr: Hir) -> Hir { |
1466 | match expr.into_kind() { |
1467 | HirKind::Class(mut cls) => { |
1468 | cls.case_fold_simple(); |
1469 | Hir::class(cls) |
1470 | } |
1471 | _ => panic!("cannot case fold non-class Hir expr" ), |
1472 | } |
1473 | } |
1474 | |
1475 | fn hir_negate(expr: Hir) -> Hir { |
1476 | match expr.into_kind() { |
1477 | HirKind::Class(mut cls) => { |
1478 | cls.negate(); |
1479 | Hir::class(cls) |
1480 | } |
1481 | _ => panic!("cannot negate non-class Hir expr" ), |
1482 | } |
1483 | } |
1484 | |
1485 | fn uclass(ranges: &[(char, char)]) -> hir::Class { |
1486 | let ranges: Vec<hir::ClassUnicodeRange> = ranges |
1487 | .iter() |
1488 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
1489 | .collect(); |
1490 | hir::Class::Unicode(hir::ClassUnicode::new(ranges)) |
1491 | } |
1492 | |
1493 | fn bclass(ranges: &[(u8, u8)]) -> hir::Class { |
1494 | let ranges: Vec<hir::ClassBytesRange> = ranges |
1495 | .iter() |
1496 | .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) |
1497 | .collect(); |
1498 | hir::Class::Bytes(hir::ClassBytes::new(ranges)) |
1499 | } |
1500 | |
/// Applies simple case folding to `cls` and wraps the result in an HIR.
/// Only compiled when the `unicode-case` feature is enabled.
#[cfg(feature = "unicode-case")]
fn class_case_fold(cls: hir::Class) -> Hir {
    let mut folded = cls;
    folded.case_fold_simple();
    Hir::class(folded)
}
1506 | |
1507 | fn class_negate(mut cls: hir::Class) -> Hir { |
1508 | cls.negate(); |
1509 | Hir::class(cls) |
1510 | } |
1511 | |
1512 | #[allow (dead_code)] |
1513 | fn hir_union(expr1: Hir, expr2: Hir) -> Hir { |
1514 | use crate::hir::Class::{Bytes, Unicode}; |
1515 | |
1516 | match (expr1.into_kind(), expr2.into_kind()) { |
1517 | (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { |
1518 | c1.union(&c2); |
1519 | Hir::class(hir::Class::Unicode(c1)) |
1520 | } |
1521 | (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { |
1522 | c1.union(&c2); |
1523 | Hir::class(hir::Class::Bytes(c1)) |
1524 | } |
1525 | _ => panic!("cannot union non-class Hir exprs" ), |
1526 | } |
1527 | } |
1528 | |
1529 | #[allow (dead_code)] |
1530 | fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { |
1531 | use crate::hir::Class::{Bytes, Unicode}; |
1532 | |
1533 | match (expr1.into_kind(), expr2.into_kind()) { |
1534 | (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { |
1535 | c1.difference(&c2); |
1536 | Hir::class(hir::Class::Unicode(c1)) |
1537 | } |
1538 | (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { |
1539 | c1.difference(&c2); |
1540 | Hir::class(hir::Class::Bytes(c1)) |
1541 | } |
1542 | _ => panic!("cannot difference non-class Hir exprs" ), |
1543 | } |
1544 | } |
1545 | |
/// Builds a look-around assertion HIR via `Hir::look`.
fn hir_look(look: hir::Look) -> Hir {
    Hir::look(look)
}
1549 | |
// Empty patterns, empty groups and empty alternation branches all
// translate to `Hir::empty()` in the corresponding position.
#[test]
fn empty() {
    // Each entry pairs a pattern with the HIR it must translate to.
    let cases: Vec<(&str, Hir)> = vec![
        ("", Hir::empty()),
        ("(?i)", Hir::empty()),
        ("()", hir_capture(1, Hir::empty())),
        ("(?:)", Hir::empty()),
        ("(?P<wat>)", hir_capture_name(1, "wat", Hir::empty())),
        ("|", hir_alt(vec![Hir::empty(), Hir::empty()])),
        (
            "()|()",
            hir_alt(vec![
                hir_capture(1, Hir::empty()),
                hir_capture(2, Hir::empty()),
            ]),
        ),
        (
            "(|b)",
            hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b")])),
        ),
        (
            "(a|)",
            hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty()])),
        ),
        (
            "(a||c)",
            hir_capture(
                1,
                hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c")]),
            ),
        ),
        (
            "(||)",
            hir_capture(
                1,
                hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty()]),
            ),
        ),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected, "pattern: {:?}", pattern);
    }
}
1588 | |
// Literals translate to literal HIR nodes, adjacent literals are merged,
// and escapes are resolved to the bytes they denote.
#[test]
fn literal() {
    assert_eq!(t("a"), hir_lit("a"));
    assert_eq!(t("(?-u)a"), hir_lit("a"));
    assert_eq!(t("☃"), hir_lit("☃"));
    assert_eq!(t("abcd"), hir_lit("abcd"));

    // With the UTF-8 option disabled, arbitrary bytes are permitted. Note
    // that the second pattern is not a raw string: its `\x61` is resolved by
    // Rust to `a` before the regex parser ever sees it.
    assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
    assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
    assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    // A non-ASCII literal is rejected when Unicode mode is disabled.
    assert_eq!(
        t_err("(?-u)☃"),
        TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(8, 1, 7)
            ),
        }
    );
    // A non-UTF-8 byte is rejected when the UTF-8 option is enabled.
    assert_eq!(
        t_err(r"(?-u)\xFF"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(9, 1, 10)
            ),
        }
    );
}
1622 | |
// Case insensitive literals translate to classes holding every case
// variant: Unicode classes in Unicode mode, byte classes otherwise.
#[test]
fn literal_case_insensitive() {
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)ab@c"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_uclass(&[('B', 'B'), ('b', 'b')]),
            hir_lit("@"),
            hir_uclass(&[('C', 'C'), ('c', 'c')]),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)β"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );

    assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("a"),
        ])
    );
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
            hir_lit("@"),
            hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
        ])
    );

    assert_eq!(
        t_bytes("(?i-u)a"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    // Not a raw string: Rust resolves `\x61` to `a` before parsing.
    assert_eq!(
        t_bytes("(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(
        t_bytes(r"(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    // A non-ASCII byte has no case variants and stays a literal.
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    // A non-ASCII character is rejected when Unicode mode is disabled.
    assert_eq!(
        t_err("(?i-u)β"),
        TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(8, 1, 8),
            ),
        }
    );
}
1699 | |
// `.` translates to a class whose contents depend on the `s`, `R` and `u`
// flags, and is rejected in byte mode when only valid UTF-8 is permitted.
#[test]
fn dot() {
    assert_eq!(
        t("."),
        hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
    );
    assert_eq!(
        t("(?R)."),
        hir_uclass(&[
            ('\0', '\t'),
            ('\x0B', '\x0C'),
            ('\x0E', '\u{10FFFF}'),
        ])
    );
    assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
    assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
    assert_eq!(
        t_bytes("(?-u)."),
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
    );
    assert_eq!(
        t_bytes("(?R-u)."),
        hir_bclass(&[
            (b'\0', b'\t'),
            (b'\x0B', b'\x0C'),
            (b'\x0E', b'\xFF'),
        ])
    );
    assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
    assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    assert_eq!(
        t_err("(?-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(6, 1, 7)
            ),
        }
    );
    assert_eq!(
        t_err("(?R-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(7, 1, 8)
            ),
        }
    );
    assert_eq!(
        t_err("(?s-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(7, 1, 8)
            ),
        }
    );
    assert_eq!(
        t_err("(?Rs-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(7, 1, 8),
                Position::new(8, 1, 9)
            ),
        }
    );
}
1773 | |
// Anchors and word boundaries translate to the look-around assertion
// selected by the multi-line and Unicode flags.
#[test]
fn assertions() {
    // Each entry pairs a pattern with the assertion it must translate to.
    let cases: Vec<(&str, hir::Look)> = vec![
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        (r"\b", hir::Look::WordUnicode),
        (r"\B", hir::Look::WordUnicodeNegate),
        (r"(?-u)\b", hir::Look::WordAscii),
        (r"(?-u)\B", hir::Look::WordAsciiNegate),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look), "pattern: {:?}", pattern);
    }
}
1790 | |
// Capturing groups become capture nodes numbered in order of appearance,
// while non-capturing groups leave no trace in the HIR.
#[test]
fn group() {
    // Each entry pairs a pattern with the HIR it must translate to.
    let cases: Vec<(&str, Hir)> = vec![
        ("(a)", hir_capture(1, hir_lit("a"))),
        (
            "(a)(b)",
            hir_cat(vec![
                hir_capture(1, hir_lit("a")),
                hir_capture(2, hir_lit("b")),
            ]),
        ),
        (
            "(a)|(b)",
            hir_alt(vec![
                hir_capture(1, hir_lit("a")),
                hir_capture(2, hir_lit("b")),
            ]),
        ),
        ("(?P<foo>)", hir_capture_name(1, "foo", Hir::empty())),
        ("(?P<foo>a)", hir_capture_name(1, "foo", hir_lit("a"))),
        (
            "(?P<foo>a)(?P<bar>b)",
            hir_cat(vec![
                hir_capture_name(1, "foo", hir_lit("a")),
                hir_capture_name(2, "bar", hir_lit("b")),
            ]),
        ),
        ("(?:)", Hir::empty()),
        ("(?:a)", hir_lit("a")),
        (
            "(?:a)(b)",
            hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b"))]),
        ),
        (
            "(a)(?:b)(c)",
            hir_cat(vec![
                hir_capture(1, hir_lit("a")),
                hir_lit("b"),
                hir_capture(2, hir_lit("c")),
            ]),
        ),
        (
            "(a)(?P<foo>b)(c)",
            hir_cat(vec![
                hir_capture(1, hir_lit("a")),
                hir_capture_name(2, "foo", hir_lit("b")),
                hir_capture(3, hir_lit("c")),
            ]),
        ),
        ("()", hir_capture(1, Hir::empty())),
        ("((?i))", hir_capture(1, Hir::empty())),
        ("((?x))", hir_capture(1, Hir::empty())),
        ("(((?x)))", hir_capture(1, hir_capture(2, Hir::empty()))),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected, "pattern: {:?}", pattern);
    }
}
1847 | |
// `^` and `$` only become line anchors in multi-line mode, where the CRLF
// flag selects the line terminator. `\A` and `\z` are never affected.
#[test]
fn line_anchors() {
    // Each entry pairs a pattern with the assertion it must translate to.
    let cases: Vec<(&str, hir::Look)> = vec![
        // No flags.
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        // Multi-line mode.
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        // CRLF mode alone changes nothing.
        (r"(?R)\A", hir::Look::Start),
        (r"(?R)\z", hir::Look::End),
        ("(?R)^", hir::Look::Start),
        ("(?R)$", hir::Look::End),
        // CRLF and multi-line mode together.
        (r"(?Rm)\A", hir::Look::Start),
        (r"(?Rm)\z", hir::Look::End),
        ("(?Rm)^", hir::Look::StartCRLF),
        ("(?Rm)$", hir::Look::EndCRLF),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look), "pattern: {:?}", pattern);
    }
}
1870 | |
#[test]
fn flags() {
    // Expected translation of a case-insensitive `a` with Unicode enabled.
    // Gated like its users so that no configuration sees an unused binding.
    #[cfg(feature = "unicode-case")]
    let folded_a = || hir_uclass(&[('A', 'A'), ('a', 'a')]);
    // Expected translation of a case-insensitive `a` with Unicode disabled.
    let folded_a_bytes = || hir_bclass(&[(b'A', b'A'), (b'a', b'a')]);
    let lit_a = || hir_lit("a");
    let star_a = |greedy: bool| hir_star(greedy, hir_lit("a"));

    // Flags set inside a group apply only within that group.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i:a)a"), hir_cat(vec![folded_a(), lit_a()]));
    assert_eq!(
        t("(?i-u:a)β"),
        hir_cat(vec![folded_a_bytes(), hir_lit("β")])
    );
    assert_eq!(
        t("(?:(?i-u)a)b"),
        hir_cat(vec![folded_a_bytes(), hir_lit("b")])
    );
    assert_eq!(
        t("((?i-u)a)b"),
        hir_cat(vec![hir_capture(1, folded_a_bytes()), hir_lit("b")])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)(?-i:a)a"), hir_cat(vec![lit_a(), folded_a()]));

    // Several flags at once, and flags being switched off again.
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![folded_a(), hir_look(hir::Look::StartLF)])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            folded_a(),
            hir_look(hir::Look::StartLF),
            folded_a(),
            hir_look(hir::Look::Start),
        ])
    );

    // The `U` flag inverts the greediness of `*` and `*?`.
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            star_a(false),
            star_a(true),
            star_a(true),
            star_a(false),
        ])
    );

    // A flag set in the middle of a group affects only the rest of that
    // group.
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![hir_cat(vec![lit_a(), folded_a()]), lit_a()])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![hir_cat(vec![folded_a(), lit_a()]), folded_a()])
    );
}
1958 | |
#[test]
fn escape() {
    // A run of escaped meta characters must translate to a single literal
    // made of the unescaped characters.
    let escaped = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
    let unescaped = r"\.+*?()|[]{}^$#";
    assert_eq!(t(escaped), hir_lit(unescaped));
}
1966 | |
#[test]
fn repetition() {
    let a = || hir_lit("a");
    let b = || hir_lit("b");

    // (pattern, expected HIR); the boolean passed to each repetition helper
    // is `false` exactly for the patterns carrying the `?` suffix.
    let cases = vec![
        // Operator forms.
        ("a?", hir_quest(true, a())),
        ("a*", hir_star(true, a())),
        ("a+", hir_plus(true, a())),
        ("a??", hir_quest(false, a())),
        ("a*?", hir_star(false, a())),
        ("a+?", hir_plus(false, a())),
        // Counted forms.
        ("a{1}", hir_range(true, 1, Some(1), a())),
        ("a{1,}", hir_range(true, 1, None, a())),
        ("a{1,2}", hir_range(true, 1, Some(2), a())),
        ("a{1}?", hir_range(false, 1, Some(1), a())),
        ("a{1,}?", hir_range(false, 1, None, a())),
        ("a{1,2}?", hir_range(false, 1, Some(2), a())),
        // A repetition operator binds to the item immediately before it.
        ("ab?", hir_cat(vec![a(), hir_quest(true, b())])),
        ("(ab)?", hir_quest(true, hir_capture(1, hir_lit("ab")))),
        ("a|b?", hir_alt(vec![a(), hir_quest(true, b())])),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
1993 | |
#[test]
fn cat_alt() {
    // Zero-width building blocks.
    let start = || hir_look(hir::Look::Start);
    let end = || hir_look(hir::Look::End);
    let word = || hir_look(hir::Look::WordUnicode);
    let not_word = || hir_look(hir::Look::WordUnicodeNegate);

    // Expectations that appear both bare and wrapped in a capture group.
    let start_or_end = || hir_alt(vec![start(), end()]);
    let start_end_or_word = || hir_alt(vec![start(), end(), word()]);
    let alt_of_pairs = || {
        hir_alt(vec![
            hir_cat(vec![start(), end()]),
            hir_cat(vec![end(), word()]),
            hir_cat(vec![word(), not_word()]),
        ])
    };

    assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![start(), end()])));
    assert_eq!(t("^|$"), start_or_end());
    assert_eq!(t(r"^|$|\b"), start_end_or_word());
    assert_eq!(t(r"^$|$\b|\b\B"), alt_of_pairs());
    assert_eq!(t("(^|$)"), hir_capture(1, start_or_end()));
    assert_eq!(t(r"(^|$|\b)"), hir_capture(1, start_end_or_word()));
    assert_eq!(t(r"(^$|$\b|\b\B)"), hir_capture(1, alt_of_pairs()));

    // Nested groups keep their own alternations and concatenations.
    let innermost = hir_capture(3, hir_cat(vec![word(), not_word()]));
    let middle =
        hir_capture(2, hir_alt(vec![hir_cat(vec![end(), word()]), innermost]));
    let outer =
        hir_capture(1, hir_alt(vec![hir_cat(vec![start(), end()]), middle]));
    assert_eq!(t(r"(^$|($\b|(\b\B)))"), outer);
}
2045 | |
// Tests the HIR transformation of things like '[a-z]|[A-Z]' into
// '[A-Za-z]'. In other words, an alternation of just classes is always
// equivalent to a single class corresponding to the union of the branches
// in that class. (Unless some branches match invalid UTF-8 and others
// match non-ASCII Unicode.)
#[test]
fn cat_class_flattened() {
    assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
    // Combining all of the letter properties should give us the one giant
    // letter property.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)
            \p{Lowercase_Letter}
            |\p{Uppercase_Letter}
            |\p{Titlecase_Letter}
            |\p{Modifier_Letter}
            |\p{Other_Letter}
        "),
        hir_uclass_query(ClassQuery::Binary("letter"))
    );
    // Byte classes that can truly match invalid UTF-8 cannot be combined
    // with Unicode classes.
    assert_eq!(
        t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
        hir_alt(vec![
            hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
            hir_bclass(&[(b'\x90', b'\xFF')]),
            hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
        ])
    );
    // Byte classes on their own can be combined, even if some are ASCII
    // and others are invalid UTF-8.
    assert_eq!(
        t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
        hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
    );
}
2084 | |
#[test]
fn class_ascii() {
    // With Unicode enabled, each named ASCII class translates to the
    // Unicode class built from the same `ClassAsciiKind`.
    let named = [
        ("[[:alnum:]]", ast::ClassAsciiKind::Alnum),
        ("[[:alpha:]]", ast::ClassAsciiKind::Alpha),
        ("[[:ascii:]]", ast::ClassAsciiKind::Ascii),
        ("[[:blank:]]", ast::ClassAsciiKind::Blank),
        ("[[:cntrl:]]", ast::ClassAsciiKind::Cntrl),
        ("[[:digit:]]", ast::ClassAsciiKind::Digit),
        ("[[:graph:]]", ast::ClassAsciiKind::Graph),
        ("[[:lower:]]", ast::ClassAsciiKind::Lower),
        ("[[:print:]]", ast::ClassAsciiKind::Print),
        ("[[:punct:]]", ast::ClassAsciiKind::Punct),
        ("[[:space:]]", ast::ClassAsciiKind::Space),
        ("[[:upper:]]", ast::ClassAsciiKind::Upper),
        ("[[:word:]]", ast::ClassAsciiKind::Word),
        ("[[:xdigit:]]", ast::ClassAsciiKind::Xdigit),
    ];
    for (pattern, kind) in named {
        assert_eq!(t(pattern), hir_ascii_uclass(&kind));
    }

    // Negation and case folding in Unicode mode.
    assert_eq!(
        t("[[:^lower:]]"),
        hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[[:lower:]]"),
        hir_uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ])
    );

    // With Unicode disabled, the result is a byte class instead.
    assert_eq!(
        t("(?-u)[[:lower:]]"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Lower)
    );
    assert_eq!(
        t("(?i-u)[[:lower:]]"),
        hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower))
    );

    // Negating the class with Unicode disabled is rejected by `t_err`'s
    // translator with `InvalidUtf8`; the span covers the `[:^lower:]` item.
    assert_eq!(
        t_err("(?-u)[[:^lower:]]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(16, 1, 17)
            ),
        }
    );
    assert_eq!(
        t_err("(?i-u)[[:^lower:]]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(7, 1, 8),
                Position::new(17, 1, 18)
            ),
        }
    );
}
2189 | |
#[test]
fn class_ascii_multiple() {
    // See: https://github.com/rust-lang/regex/issues/680
    //
    // Two ASCII class items inside one bracketed class must be unioned,
    // including when one of them is negated.
    assert_eq!(
        t("[[:alnum:][:^ascii:]]"),
        hir_union(
            hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
            hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
        ),
    );
    assert_eq!(
        t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
        hir_union(
            hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
            hir_bclass(&[(0x80, 0xFF)]),
        ),
    );
}
2208 | |
#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl_unicode() {
    // Expected classes for `\d`, `\s` and `\w`.
    let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
    let space = || hir_uclass_query(ClassQuery::Binary("space"));
    let word = hir_uclass_perl_word;

    // Unicode
    assert_eq!(t(r"\d"), digit());
    assert_eq!(t(r"\s"), space());
    assert_eq!(t(r"\w"), word());
    // The `i` flag leaves the expected classes unchanged.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\d"), digit());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\s"), space());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\w"), word());

    // Unicode, negated
    assert_eq!(t(r"\D"), hir_negate(digit()));
    assert_eq!(t(r"\S"), hir_negate(space()));
    assert_eq!(t(r"\W"), hir_negate(word()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\D"), hir_negate(digit()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\S"), hir_negate(space()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\W"), hir_negate(word()));
}
2252 | |
#[test]
fn class_perl_ascii() {
    // Expected byte classes for `\d`, `\s` and `\w` with Unicode disabled.
    let digit = || hir_ascii_bclass(&ast::ClassAsciiKind::Digit);
    let space = || hir_ascii_bclass(&ast::ClassAsciiKind::Space);
    let word = || hir_ascii_bclass(&ast::ClassAsciiKind::Word);

    // ASCII only
    let plain = vec![
        (r"(?-u)\d", digit()),
        (r"(?-u)\s", space()),
        (r"(?-u)\w", word()),
        (r"(?i-u)\d", digit()),
        (r"(?i-u)\s", space()),
        (r"(?i-u)\w", word()),
    ];
    for (pattern, expected) in plain {
        assert_eq!(t(pattern), expected);
    }

    // ASCII only, negated
    let negated = vec![
        (r"(?-u)\D", digit()),
        (r"(?-u)\S", space()),
        (r"(?-u)\W", word()),
        (r"(?i-u)\D", digit()),
        (r"(?i-u)\S", space()),
        (r"(?i-u)\W", word()),
    ];
    for (pattern, class) in negated {
        assert_eq!(t_bytes(pattern), hir_negate(class));
    }

    // ASCII only, negated, with UTF-8 mode enabled.
    // In this case, negating any Perl class results in an error because
    // all such classes can match invalid UTF-8.
    //
    // Each row is (pattern, start offset, end offset) of the reported
    // span; everything is on line 1, so the column is the offset plus one.
    let rejected = [
        (r"(?-u)\D", 5, 7),
        (r"(?-u)\S", 5, 7),
        (r"(?-u)\W", 5, 7),
        (r"(?i-u)\D", 6, 8),
        (r"(?i-u)\S", 6, 8),
        (r"(?i-u)\W", 6, 8),
    ];
    for (pattern, start, end) in rejected {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                span: Span::new(
                    Position::new(start, 1, start + 1),
                    Position::new(end, 1, end + 1),
                ),
            },
        );
    }
}
2371 | |
#[test]
#[cfg(not(feature = "unicode-perl"))]
fn class_perl_word_disabled() {
    // Without the `unicode-perl` feature, `\w` must be reported as not
    // found, with a span covering the whole two-character pattern.
    let whole_pattern =
        Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: whole_pattern,
    };
    assert_eq!(t_err(r"\w"), expected);
}
2386 | |
#[test]
#[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
fn class_perl_space_disabled() {
    // With both `unicode-perl` and `unicode-bool` disabled, `\s` must be
    // reported as not found, with a span covering the whole pattern.
    let whole_pattern =
        Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: whole_pattern,
    };
    assert_eq!(t_err(r"\s"), expected);
}
2401 | |
#[test]
#[cfg(all(
    not(feature = "unicode-perl"),
    not(feature = "unicode-gencat")
))]
fn class_perl_digit_disabled() {
    // With both `unicode-perl` and `unicode-gencat` disabled, `\d` must be
    // reported as not found, with a span covering the whole pattern.
    let whole_pattern =
        Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: whole_pattern,
    };
    assert_eq!(t_err(r"\d"), expected);
}
2419 | |
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_gencat() {
    // Builds the expected class from a binary property query.
    let binary =
        |name: &'static str| hir_uclass_query(ClassQuery::Binary(name));

    // Different spellings that must all resolve to the queried class.
    let plain = [
        (r"\pZ", "Z"),
        (r"\pz", "Z"),
        (r"\p{Separator}", "Z"),
        (r"\p{se PaRa ToR}", "Z"),
        (r"\p{gc:Separator}", "Z"),
        (r"\p{gc=Separator}", "Z"),
        (r"\p{Other}", "Other"),
        (r"\pC", "Other"),
    ];
    for (pattern, name) in plain {
        assert_eq!(t(pattern), binary(name));
    }

    // Negated spellings.
    let negated = [
        (r"\PZ", "Z"),
        (r"\P{separator}", "Z"),
        (r"\P{gc!=separator}", "Z"),
    ];
    for (pattern, name) in negated {
        assert_eq!(t(pattern), hir_negate(binary(name)));
    }

    // `any`, `assigned` and `ascii`, with and without the `gc:` prefix.
    let special = [
        (r"\p{any}", "Any"),
        (r"\p{assigned}", "Assigned"),
        (r"\p{ascii}", "ASCII"),
        (r"\p{gc:any}", "Any"),
        (r"\p{gc:assigned}", "Assigned"),
        (r"\p{gc:ascii}", "ASCII"),
    ];
    for (pattern, name) in special {
        assert_eq!(t(pattern), binary(name));
    }

    // Rejected patterns: (pattern, error kind, span start, span end).
    // Everything is on line 1, so the column is the offset plus one.
    let rejected = [
        (r"(?-u)\pZ", hir::ErrorKind::UnicodeNotAllowed, 5, 8),
        (r"(?-u)\p{Separator}", hir::ErrorKind::UnicodeNotAllowed, 5, 18),
        (r"\pE", hir::ErrorKind::UnicodePropertyNotFound, 0, 3),
        (r"\p{Foo}", hir::ErrorKind::UnicodePropertyNotFound, 0, 7),
        (r"\p{gc:Foo}", hir::ErrorKind::UnicodePropertyValueNotFound, 0, 10),
    ];
    for (pattern, kind, start, end) in rejected {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind,
                span: Span::new(
                    Position::new(start, 1, start + 1),
                    Position::new(end, 1, end + 1)
                ),
            }
        );
    }
}
2533 | |
#[test]
#[cfg(not(feature = "unicode-gencat"))]
fn class_unicode_gencat_disabled() {
    // Without `unicode-gencat`, these properties must be reported as not
    // found. Each row is (pattern, end offset of the reported span); the
    // span always starts at offset 0 on line 1.
    let rejected = [(r"\p{Separator}", 13), (r"\p{Any}", 7)];
    for (pattern, end) in rejected {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(end, 1, end + 1)
                ),
            }
        );
    }
}
2559 | |
#[test]
#[cfg(feature = "unicode-script")]
fn class_unicode_script() {
    let greek = || hir_uclass_query(ClassQuery::Binary("Greek"));

    assert_eq!(t(r"\p{Greek}"), greek());
    // Case folding is applied first; negation wraps the folded class.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\p{Greek}"), hir_case_fold(greek()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\P{Greek}"), hir_negate(hir_case_fold(greek())));

    // Unknown script values: (pattern, end offset of the reported span).
    // The span always starts at offset 0 on line 1.
    let rejected = [(r"\p{sc:Foo}", 10), (r"\p{scx:Foo}", 11)];
    for (pattern, end) in rejected {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyValueNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(end, 1, end + 1)
                ),
            }
        );
    }
}
2601 | |
#[test]
#[cfg(not(feature = "unicode-script"))]
fn class_unicode_script_disabled() {
    // Without `unicode-script`, script properties must be reported as not
    // found. Each row is (pattern, end offset of the reported span); the
    // span always starts at offset 0 on line 1.
    let rejected = [(r"\p{Greek}", 9), (r"\p{scx:Greek}", 13)];
    for (pattern, end) in rejected {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::UnicodePropertyNotFound,
                span: Span::new(
                    Position::new(0, 1, 1),
                    Position::new(end, 1, end + 1)
                ),
            }
        );
    }
}
2627 | |
#[test]
#[cfg(feature = "unicode-age")]
fn class_unicode_age() {
    // An unknown `age` value must be reported as a missing property
    // value, with a span covering the whole pattern.
    let whole_pattern =
        Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePropertyValueNotFound,
        span: whole_pattern,
    };
    assert_eq!(t_err(r"\p{age:Foo}"), expected);
}
2642 | |
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_any_empty() {
    // Negating `any` must yield a Unicode class with no ranges.
    let no_ranges = hir_uclass(&[]);
    assert_eq!(t(r"\P{any}"), no_ranges);
}
2648 | |
#[test]
#[cfg(not(feature = "unicode-age"))]
fn class_unicode_age_disabled() {
    // Without `unicode-age`, the `age` property must be reported as not
    // found, with a span covering the whole pattern.
    let whole_pattern =
        Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePropertyNotFound,
        span: whole_pattern,
    };
    assert_eq!(t_err(r"\p{age:3.0}"), expected);
}
2663 | |
#[test]
fn class_bracketed() {
    // Basic classes. A class containing a single character is expected to
    // translate to a literal rather than a class.
    assert_eq!(t("[a]"), hir_lit("a"));
    assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
    assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
    assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
    assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
    assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
    assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
    // Same as above, but with a real newline character in the pattern
    // instead of the two-character escape.
    assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));

    // Perl and Unicode classes nested inside brackets.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\pZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\p{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    // A negated item inside a negated class cancels out.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\PZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\P{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(all(
        feature = "unicode-case",
        any(feature = "unicode-perl", feature = "unicode-gencat")
    ))]
    assert_eq!(
        t(r"(?i)[^\D]"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\P{greek}]"),
        hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
    );

    // Byte classes (Unicode disabled).
    assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
    assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
    assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));

    // Case insensitive classes.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[k]"),
        hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[β]"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );
    assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));

    // Negated classes.
    assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
    assert_eq!(
        t_bytes("(?-u)[^a]"),
        class_negate(bclass(&[(b'a', b'a')]))
    );
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(
        t(r"[^\d]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\pZ]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\p{separator}]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\p{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[\P{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );

    // Test some weird cases.
    assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));

    assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
    assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));

    assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
    assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));

    assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
    assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));

    // A negated byte class is rejected by `t_err`'s translator with
    // `InvalidUtf8`; the span covers the whole bracketed class.
    assert_eq!(
        t_err("(?-u)[^a]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(9, 1, 10)
            ),
        }
    );
    // Negating a class that matches everything yields an empty class.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
}
2803 | |
#[test]
fn class_bracketed_union() {
    // Building blocks for the expectations below. Each binding is gated on
    // the same features as an assertion that uses it, so no feature
    // combination is left with an unused binding.
    #[cfg(feature = "unicode-gencat")]
    let separator = || hir_uclass_query(ClassQuery::Binary("separator"));
    #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
    let greek_or_separator = || {
        hir_union(
            hir_uclass_query(ClassQuery::Binary("greek")),
            separator(),
        )
    };
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    let age_3_0 = || {
        hir_uclass_query(ClassQuery::ByValue {
            property_name: "age",
            property_value: "3.0",
        })
    };
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    let age_greek_or_separator =
        || hir_union(age_3_0(), greek_or_separator());

    assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[a\pZb]"),
        hir_union(hir_uclass(&[('a', 'b')]), separator())
    );
    #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
    assert_eq!(t(r"[\pZ\p{Greek}]"), greek_or_separator());
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(t(r"[\p{age:3.0}\pZ\p{Greek}]"), age_greek_or_separator());
    // Nested brackets flatten into one union.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
        hir_union(
            age_3_0(),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("cyrillic")),
                greek_or_separator()
            )
        )
    );

    // Case folding and negation apply to the union as a whole; negation
    // wraps the folded class.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-case",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
        hir_case_fold(age_greek_or_separator())
    );
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(age_greek_or_separator())
    );
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-case",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(hir_case_fold(age_greek_or_separator()))
    );
}
2920 | |
// Checks translation of bracketed classes that contain a nested negated
// class, with and without outer negation and case insensitivity.
#[test]
fn class_bracketed_nested() {
    // Outer class is not negated.
    for pattern in [r"[a[^c]]", r"[a-b[^c]]"] {
        assert_eq!(t(pattern), class_negate(uclass(&[('c', 'c')])));
    }
    assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));

    // Outer class is negated as well.
    for pattern in [r"[^a[^c]]", r"[^a-b[^c]]"] {
        assert_eq!(t(pattern), hir_uclass(&[('c', 'c')]));
    }

    // The same shapes again, but case insensitively.
    #[cfg(feature = "unicode-case")]
    {
        for pattern in [r"(?i)[a[^c]]", r"(?i)[a-b[^c]]"] {
            assert_eq!(
                t(pattern),
                hir_negate(class_case_fold(uclass(&[('c', 'c')])))
            );
        }
        for pattern in [r"(?i)[^a[^c]]", r"(?i)[^a-b[^c]]"] {
            assert_eq!(t(pattern), hir_uclass(&[('C', 'C'), ('c', 'c')]));
        }
    }

    // Negating a class that already covers everything yields an empty class.
    assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]));
}
2953 | |
// Checks translation of the `&&` (intersection) operator inside bracketed
// classes in Unicode, byte, and case insensitive modes.
#[test]
fn class_bracketed_intersect() {
    // Unicode mode: each entry is (pattern, range start, range end).
    for (pattern, lo, hi) in [
        ("[abc&&b-c]", 'b', 'c'),
        ("[abc&&[b-c]]", 'b', 'c'),
        ("[[abc]&&[b-c]]", 'b', 'c'),
        ("[a-z&&b-y&&c-x]", 'c', 'x'),
        ("[c-da-b&&a-d]", 'a', 'd'),
        ("[a-d&&c-da-b]", 'a', 'd'),
        (r"[a-z&&a-c]", 'a', 'c'),
        (r"[[a-z&&a-c]]", 'a', 'c'),
    ] {
        assert_eq!(t(pattern), hir_uclass(&[(lo, hi)]));
    }
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));

    // Byte mode.
    for (pattern, lo, hi) in [
        ("(?-u)[abc&&b-c]", b'b', b'c'),
        ("(?-u)[abc&&[b-c]]", b'b', b'c'),
        ("(?-u)[[abc]&&[b-c]]", b'b', b'c'),
        ("(?-u)[a-z&&b-y&&c-x]", b'c', b'x'),
        ("(?-u)[c-da-b&&a-d]", b'a', b'd'),
        ("(?-u)[a-d&&c-da-b]", b'a', b'd'),
    ] {
        assert_eq!(t(pattern), hir_bclass(&[(lo, hi)]));
    }

    // Case insensitive Unicode mode.
    #[cfg(feature = "unicode-case")]
    {
        for (pattern, lo, hi) in [
            ("(?i)[abc&&b-c]", 'b', 'c'),
            ("(?i)[abc&&[b-c]]", 'b', 'c'),
            ("(?i)[[abc]&&[b-c]]", 'b', 'c'),
            ("(?i)[a-z&&b-y&&c-x]", 'c', 'x'),
            ("(?i)[c-da-b&&a-d]", 'a', 'd'),
            ("(?i)[a-d&&c-da-b]", 'a', 'd'),
        ] {
            assert_eq!(t(pattern), hir_case_fold(hir_uclass(&[(lo, hi)])));
        }
    }

    // Case insensitive byte mode.
    for (pattern, lo, hi) in [
        ("(?i-u)[abc&&b-c]", b'b', b'c'),
        ("(?i-u)[abc&&[b-c]]", b'b', b'c'),
        ("(?i-u)[[abc]&&[b-c]]", b'b', b'c'),
        ("(?i-u)[a-z&&b-y&&c-x]", b'c', b'x'),
        ("(?i-u)[c-da-b&&a-d]", b'a', b'd'),
        ("(?i-u)[a-d&&c-da-b]", b'a', b'd'),
    ] {
        assert_eq!(t(pattern), hir_case_fold(hir_bclass(&[(lo, hi)])));
    }

    // Punctuation around `&&`; each pattern yields a single-character class.
    for (pattern, ch) in [
        // In `[a^]`, `^` does not need to be escaped, so it makes sense
        // that `^` is also allowed to be unescaped after `&&`.
        (r"[\^&&^]", '^'),
        // `]` needs to be escaped after `&&` since it's not at the start
        // of the class.
        (r"[]&&\]]", ']'),
        (r"[-&&-]", '-'),
        (r"[\&&&&]", '&'),
        (r"[\&&&\&]", '&'),
    ] {
        assert_eq!(t(pattern), hir_uclass(&[(ch, ch)]));
    }

    // Test precedence.
    let expected = hir_uclass(&[('a', 'b'), ('h', 'w')]);
    assert_eq!(t(r"[a-w&&[^c-g]z]"), expected);
}
3043 | |
// Checks how negation combines with `&&` inside bracketed classes, for both
// Unicode patterns (via `t`) and byte patterns (via `t_bytes`).
#[test]
fn class_bracketed_intersect_negate() {
    #[cfg(feature = "unicode-perl")]
    {
        let actual = t(r"[^\w&&\d]");
        let expected =
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")));
        assert_eq!(actual, expected);
    }

    {
        let actual = t(r"[^[a-z&&a-c]]");
        let expected = hir_negate(hir_uclass(&[('a', 'c')]));
        assert_eq!(actual, expected);
    }

    #[cfg(feature = "unicode-perl")]
    {
        assert_eq!(
            t(r"[^[\w&&\d]]"),
            hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
        );
        assert_eq!(
            t(r"[^[^\w&&\d]]"),
            hir_uclass_query(ClassQuery::Binary("digit"))
        );
        assert_eq!(
            t(r"[[[^\w]&&[^\d]]]"),
            hir_negate(hir_uclass_perl_word())
        );

        // Byte oriented, but still gated on `unicode-perl` like the
        // assertions above.
        assert_eq!(
            t_bytes(r"(?-u)[^\w&&\d]"),
            hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
        );
    }

    // The byte oriented counterparts of the assertions above.
    let ascii_digit = || hir_ascii_bclass(&ast::ClassAsciiKind::Digit);
    let ascii_word = || hir_ascii_bclass(&ast::ClassAsciiKind::Word);

    assert_eq!(
        t_bytes(r"(?-u)[^[a-z&&a-c]]"),
        hir_negate(hir_bclass(&[(b'a', b'c')]))
    );
    assert_eq!(t_bytes(r"(?-u)[^[\w&&\d]]"), hir_negate(ascii_digit()));
    assert_eq!(t_bytes(r"(?-u)[^[^\w&&\d]]"), ascii_digit());
    assert_eq!(t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), hir_negate(ascii_word()));
}
3087 | |
// Checks translation of the `--` (difference) operator inside bracketed
// classes, in both Unicode and byte modes.
#[test]
fn class_bracketed_difference() {
    // Letters minus the ASCII range. The bounds of the ASCII range must be
    // single-character literals: `'\0'` and `'\x7F'`.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\pL--[:ascii:]]"),
        hir_difference(
            hir_uclass_query(ClassQuery::Binary("letter")),
            hir_uclass(&[('\0', '\x7F')])
        )
    );

    // ASCII alphabetic minus ASCII lowercase leaves the uppercase letters.
    assert_eq!(
        t(r"(?-u)[[:alpha:]--[:lower:]]"),
        hir_bclass(&[(b'A', b'Z')])
    );
}
3104 | |
// Checks translation of the `~~` (symmetric difference) operator inside
// bracketed classes, in both Unicode and byte modes.
#[test]
fn class_bracketed_symmetric_difference() {
    // Each bound must be a single-character literal, so the `\u{...}`
    // escapes are written without any padding inside the quotes.
    #[cfg(feature = "unicode-script")]
    assert_eq!(
        t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
        hir_uclass(&[
            ('\u{0342}', '\u{0342}'),
            ('\u{0345}', '\u{0345}'),
            ('\u{1DC0}', '\u{1DC1}'),
        ])
    );
    assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));

    assert_eq!(
        t(r"(?-u)[a-g~~c-j]"),
        hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
    );
}
3123 | |
// Checks that `(?x)` mode ignores whitespace and `#` comments, including in
// the middle of escapes, Unicode class names, and counted repetitions.
#[test]
fn ignore_whitespace() {
    // The unescaped space between `\12` and `3` is insignificant, so the
    // expected literal must not contain a space. NOTE(review): this assumes
    // the `t` helper enables octal escapes so that `\12` is a line feed;
    // confirm against the helper's definition.
    assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
    assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
{ # comment
53 # comment
} #comment"),
        hir_lit("S")
    );

    assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
53 # comment"),
        hir_lit("S")
    );
    assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)\p # comment
{ # comment
Separator # comment
} # comment"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );

    assert_eq!(
        t(r"(?x)a # comment
{ # comment
5 # comment
, # comment
10 # comment
} # comment"),
        hir_range(true, 5, Some(10), hir_lit("a"))
    );

    // An escaped space is significant even in `(?x)` mode.
    assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
}
3165 | |
// Checks the `is_utf8` property reported for a range of patterns translated
// with `props_bytes`.
#[test]
fn analysis_is_utf8() {
    // Positive examples.
    for pattern in [
        r"a",
        r"ab",
        r"(?-u)a",
        r"(?-u)ab",
        r"\xFF",
        r"\xFF\xFF",
        r"[^a]",
        r"[^a][^a]",
        r"\b",
        r"\B",
        r"(?-u)\b",
        r"(?-u)\B",
    ] {
        assert!(props_bytes(pattern).is_utf8(), "pattern: {:?}", pattern);
    }

    // Negative examples.
    for pattern in [
        r"(?-u)\xFF",
        r"(?-u)\xFF\xFF",
        r"(?-u)[^a]",
        r"(?-u)[^a][^a]",
    ] {
        assert!(!props_bytes(pattern).is_utf8(), "pattern: {:?}", pattern);
    }
}
3188 | |
// Checks the number of explicit capture groups reported for each pattern.
#[test]
fn analysis_captures_len() {
    // Each entry is (expected explicit capture count, pattern).
    for (expected, pattern) in [
        (0, r"a"),
        (0, r"(?:a)"),
        (0, r"(?i-u:a)"),
        (0, r"(?i-u)a"),
        (1, r"(a)"),
        (1, r"(?P<foo>a)"),
        (1, r"()"),
        (1, r"()a"),
        (1, r"(a)+"),
        (2, r"(a)(b)"),
        (2, r"(a)|(b)"),
        (2, r"((a))"),
        (1, r"([a&&b])"),
    ] {
        assert_eq!(
            expected,
            props(pattern).explicit_captures_len(),
            "pattern: {:?}",
            pattern
        );
    }
}
3205 | |
// Checks the value of `static_explicit_captures_len` for each pattern:
// `Some(n)` for some patterns and `None` for others.
#[test]
fn analysis_static_captures_len() {
    // Each entry is (expected static capture count, pattern).
    for (expected, pattern) in [
        (Some(0), r""),
        (Some(0), r"foo|bar"),
        (None, r"(foo)|bar"),
        (None, r"foo|(bar)"),
        (Some(1), r"(foo|bar)"),
        (Some(1), r"(a|b|c|d|e|f)"),
        (Some(1), r"(a)|(b)|(c)|(d)|(e)|(f)"),
        (Some(2), r"(a)(b)|(c)(d)|(e)(f)"),
        (Some(6), r"(a)(b)(c)(d)(e)(f)"),
        (Some(3), r"(a)(b)(extra)|(a)(b)()"),
        (Some(3), r"(a)(b)((?:extra)?)"),
        (None, r"(a)(b)(extra)?"),
        (Some(1), r"(foo)|(bar)"),
        (Some(2), r"(foo)(bar)"),
        (Some(2), r"(foo)+(bar)"),
        (None, r"(foo)*(bar)"),
        (Some(0), r"(foo)?{0}"),
        (None, r"(foo)?{1}"),
        (Some(1), r"(foo){1}"),
        (Some(1), r"(foo){1,}"),
        (Some(1), r"(foo){1,}?"),
        (None, r"(foo){1,}??"),
        (None, r"(foo){0,}"),
        (Some(1), r"(foo)(?:bar)"),
        (Some(2), r"(foo(?:bar)+)(?:baz(boo))"),
        (Some(2), r"(?P<bar>foo)(?:bar)(bal|loon)"),
        (
            Some(2),
            r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#,
        ),
    ] {
        assert_eq!(
            expected,
            props(pattern).static_explicit_captures_len(),
            "pattern: {:?}",
            pattern
        );
    }
}
3240 | |
// Checks the look-around set and minimum length reported for patterns made
// up only of assertions, and for one pattern that also matches a literal.
#[test]
fn analysis_is_all_assertions() {
    // Positive examples: some look-around is present and the minimum match
    // length is zero.
    for pattern in [
        r"\b",
        r"\B",
        r"^",
        r"$",
        r"\A",
        r"\z",
        r"$^\z\A\b\B",
        r"$|^|\z|\A|\b|\B",
        r"^$|$^",
        r"((\b)+())*^",
    ] {
        let properties = props(pattern);
        assert!(!properties.look_set().is_empty(), "pattern: {:?}", pattern);
        assert_eq!(
            properties.minimum_len(),
            Some(0),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples: a look-around is still present, but the pattern
    // must match at least one byte.
    let properties = props(r"^a");
    assert!(!properties.look_set().is_empty());
    assert_eq!(properties.minimum_len(), Some(1));
}
3289 | |
// Checks that `look_set_prefix_any` contains the ASCII word boundary for a
// pattern whose prefix is an alternation of `\b` and `_`.
#[test]
fn analysis_look_set_prefix_any() {
    let pattern = r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))";
    assert!(props(pattern).look_set_prefix_any().contains(Look::WordAscii));
}
3295 | |
// Checks whether patterns are reported as anchored at the start (their
// prefix look-around set contains `Look::Start`) or at the end (their suffix
// look-around set contains `Look::End`).
#[test]
fn analysis_is_anchored() {
    let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
    let is_end = |p| props(p).look_set_suffix().contains(Look::End);

    // Positive examples.
    assert!(is_start(r"^"));
    assert!(is_end(r"$"));

    assert!(is_start(r"^^"));
    assert!(is_end(r"$$"));

    assert!(is_start(r"^$"));
    assert!(is_end(r"^$"));

    assert!(is_start(r"^foo"));
    assert!(is_end(r"foo$"));

    assert!(is_start(r"^foo|^bar"));
    assert!(is_end(r"foo$|bar$"));

    assert!(is_start(r"^(foo|bar)"));
    assert!(is_end(r"(foo|bar)$"));

    assert!(is_start(r"^+"));
    assert!(is_end(r"$+"));
    assert!(is_start(r"^++"));
    assert!(is_end(r"$++"));
    assert!(is_start(r"(^)+"));
    assert!(is_end(r"($)+"));

    // `$^` consists only of zero-width assertions, so it is checked from
    // both ends, like every other pair in this section.
    assert!(is_start(r"$^"));
    assert!(is_end(r"$^"));
    assert!(is_start(r"$^|^$"));
    assert!(is_end(r"$^|^$"));

    assert!(is_start(r"\b^"));
    assert!(is_end(r"$\b"));
    assert!(is_start(r"^(?m:^)"));
    assert!(is_end(r"(?m:$)$"));
    assert!(is_start(r"(?m:^)^"));
    assert!(is_end(r"$(?m:$)"));

    // Negative examples.
    assert!(!is_start(r"(?m)^"));
    assert!(!is_end(r"(?m)$"));
    assert!(!is_start(r"(?m:^$)|$^"));
    assert!(!is_end(r"(?m:^$)|$^"));
    assert!(!is_start(r"$^|(?m:^$)"));
    assert!(!is_end(r"$^|(?m:^$)"));

    assert!(!is_start(r"a^"));
    assert!(!is_start(r"$a"));

    assert!(!is_end(r"a^"));
    assert!(!is_end(r"$a"));

    assert!(!is_start(r"^foo|bar"));
    assert!(!is_end(r"foo|bar$"));

    assert!(!is_start(r"^*"));
    assert!(!is_end(r"$*"));
    assert!(!is_start(r"^*+"));
    assert!(!is_end(r"$*+"));
    assert!(!is_start(r"^+*"));
    assert!(!is_end(r"$+*"));
    assert!(!is_start(r"(^)*"));
    assert!(!is_end(r"($)*"));
}
3365 | |
// Checks whether `Look::Start` / `Look::End` appear in a pattern's overall
// look-around set.
#[test]
fn analysis_is_any_anchored() {
    // Positive examples: each entry is (pattern, assertion that must be
    // present in the look-around set).
    for (pattern, look) in [
        (r"^", Look::Start),
        (r"$", Look::End),
        (r"\A", Look::Start),
        (r"\z", Look::End),
    ] {
        assert!(
            props(pattern).look_set().contains(look),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples: each entry is (pattern, assertion that must be
    // absent from the look-around set).
    for (pattern, look) in [
        (r"(?m)^", Look::Start),
        (r"(?m)$", Look::End),
        (r"$", Look::Start),
        (r"^", Look::End),
    ] {
        assert!(
            !props(pattern).look_set().contains(look),
            "pattern: {:?}",
            pattern
        );
    }
}
3383 | |
// Checks which patterns report a minimum match length of exactly zero.
#[test]
fn analysis_can_empty() {
    // Positive examples.
    for pattern in [
        r"",
        r"()",
        r"()*",
        r"()+",
        r"()?",
        r"a*",
        r"a?",
        r"a{0}",
        r"a{0,}",
        r"a{0,1}",
        r"a{0,10}",
    ] {
        assert_eq!(
            Some(0),
            props_bytes(pattern).minimum_len(),
            "pattern: {:?}",
            pattern
        );
    }
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(Some(0), props_bytes(r"\pL*").minimum_len());
    for pattern in [
        r"a*|b",
        r"b|a*",
        r"a|",
        r"|a",
        r"a||b",
        r"a*a?(abcd)*",
        r"^",
        r"$",
        r"(?m)^",
        r"(?m)$",
        r"\A",
        r"\z",
        r"\B",
        r"(?-u)\B",
        r"\b",
        r"(?-u)\b",
    ] {
        assert_eq!(
            Some(0),
            props_bytes(pattern).minimum_len(),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples.
    for pattern in [
        r"a+",
        r"a{1}",
        r"a{1,}",
        r"a{1,2}",
        r"a{1,10}",
        r"b|a",
        r"a*a+(abcd)*",
    ] {
        assert_ne!(
            Some(0),
            props_bytes(pattern).minimum_len(),
            "pattern: {:?}",
            pattern
        );
    }
    #[cfg(feature = "unicode-gencat")]
    assert_ne!(Some(0), props_bytes(r"\P{any}").minimum_len());
    for pattern in [r"[a--a]", r"[a&&b]"] {
        assert_ne!(
            Some(0),
            props_bytes(pattern).minimum_len(),
            "pattern: {:?}",
            pattern
        );
    }
}
3434 | |
// Checks which patterns are reported as plain literals.
#[test]
fn analysis_is_literal() {
    // Positive examples.
    for pattern in [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"(?:a)",
        r"foo(?:a)",
        r"(?:a)foo",
        r"[a]",
    ] {
        assert!(props(pattern).is_literal(), "pattern: {:?}", pattern);
    }

    // Negative examples.
    for pattern in [
        r"",
        r"^",
        r"a|b",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
    ] {
        assert!(!props(pattern).is_literal(), "pattern: {:?}", pattern);
    }
}
3457 | |
// Checks which patterns are reported as a literal or an alternation of
// literals.
#[test]
fn analysis_is_alternation_literal() {
    // Positive examples.
    for pattern in [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"foo|bar",
        r"foo|bar|baz",
        r"[a]",
        r"(?:ab)|cd",
        r"ab|(?:cd)",
    ] {
        assert!(
            props(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples.
    for pattern in [
        r"",
        r"^",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
        r"[ab]|b",
        r"a|[ab]",
        r"(a)|b",
        r"a|(b)",
        r"a|b",
        r"a|b|c",
        r"[a]|b",
        r"a|[b]",
        r"(?:a)|b",
        r"a|(?:b)",
        r"(?:z|xx)@|xx",
    ] {
        assert!(
            !props(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }
}
3491 | |
// Verifies that the smart `Hir::concat` constructor simplifies
// concatenations the way we expect: empty groups vanish and adjacent
// literals are merged.
#[test]
fn smart_concat() {
    for pattern in ["", "(?:)"] {
        assert_eq!(t(pattern), Hir::empty());
    }

    // Each entry is (pattern, the single literal it collapses to).
    for (pattern, literal) in [
        ("abc", "abc"),
        ("(?:foo)(?:bar)", "foobar"),
        ("quux(?:foo)(?:bar)baz", "quuxfoobarbaz"),
    ] {
        assert_eq!(t(pattern), hir_lit(literal));
    }

    // A look-around splits the literal, however the groups are nested.
    for pattern in ["foo(?:bar^baz)quux", "foo(?:ba(?:r^b)az)quux"] {
        assert_eq!(
            t(pattern),
            hir_cat(vec![
                hir_lit("foobar"),
                hir_look(hir::Look::Start),
                hir_lit("bazquux"),
            ])
        );
    }
}
3518 | |
// Verifies that the smart `Hir::alternation` constructor simplifies
// alternations the way we expect: nested alternations are flattened,
// single-character branches become a class, and common prefixes are lifted.
#[test]
fn smart_alternation() {
    // Builds the `[A-Z]` class that several expectations below share.
    let upper = || hir_uclass(&[('A', 'Z')]);

    let actual = t("(?:foo)|(?:bar)");
    assert_eq!(actual, hir_alt(vec![hir_lit("foo"), hir_lit("bar")]));

    let actual = t("quux|(?:abc|def|xyz)|baz");
    assert_eq!(
        actual,
        hir_alt(vec![
            hir_lit("quux"),
            hir_lit("abc"),
            hir_lit("def"),
            hir_lit("xyz"),
            hir_lit("baz"),
        ])
    );

    let actual = t("quux|(?:abc|(?:def|mno)|xyz)|baz");
    assert_eq!(
        actual,
        hir_alt(vec![
            hir_lit("quux"),
            hir_lit("abc"),
            hir_lit("def"),
            hir_lit("mno"),
            hir_lit("xyz"),
            hir_lit("baz"),
        ])
    );

    let actual = t("a|b|c|d|e|f|x|y|z");
    assert_eq!(actual, hir_uclass(&[('a', 'f'), ('x', 'z')]));

    // Tests that we lift common prefixes out of an alternation.
    let actual = t("[A-Z]foo|[A-Z]quux");
    assert_eq!(
        actual,
        hir_cat(vec![
            upper(),
            hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
        ])
    );

    let actual = t("[A-Z][A-Z]|[A-Z]quux");
    assert_eq!(
        actual,
        hir_cat(vec![upper(), hir_alt(vec![upper(), hir_lit("quux")])])
    );

    let actual = t("[A-Z][A-Z]|[A-Z][A-Z]quux");
    assert_eq!(
        actual,
        hir_cat(vec![
            upper(),
            upper(),
            hir_alt(vec![Hir::empty(), hir_lit("quux")]),
        ])
    );

    let actual = t("[A-Z]foo|[A-Z]foobar");
    assert_eq!(
        actual,
        hir_cat(vec![
            upper(),
            hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
        ])
    );
}
3583 | } |
3584 | |