1 | /*! |
2 | This module provides a regular expression printer for `Hir`. |
3 | */ |
4 | |
5 | use core::fmt; |
6 | |
7 | use crate::{ |
8 | hir::{ |
9 | self, |
10 | visitor::{self, Visitor}, |
11 | Hir, HirKind, |
12 | }, |
13 | is_meta_character, |
14 | }; |
15 | |
/// Builder used to construct a `Printer`.
///
/// The printer has no configuration options at present, so this builder
/// carries no state and is deliberately kept private to this module.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    _priv: (),
}
24 | |
25 | impl Default for PrinterBuilder { |
26 | fn default() -> PrinterBuilder { |
27 | PrinterBuilder::new() |
28 | } |
29 | } |
30 | |
31 | impl PrinterBuilder { |
32 | fn new() -> PrinterBuilder { |
33 | PrinterBuilder { _priv: () } |
34 | } |
35 | |
36 | fn build(&self) -> Printer { |
37 | Printer { _priv: () } |
38 | } |
39 | } |
40 | |
/// Prints a regular expression's high-level intermediate representation
/// (HIR) as a pattern string.
///
/// This printer uses constant stack space and heap space proportional to the
/// size of the HIR.
///
/// Because only the HIR is available, the printed pattern will likely look
/// nothing like the pattern the HIR was originally built from. For example,
/// `\pL` is printed with its entire class written out.
///
/// The printer exists so that an HIR can be mutated and a regular expression
/// then built from the mutated result. (A regex library could instead offer
/// a constructor that accepts this HIR directly, but doing so would create an
/// unnecessary public coupling between the regex library and this specific
/// HIR representation.)
#[derive(Debug)]
pub struct Printer {
    _priv: (),
}
61 | |
62 | impl Printer { |
63 | /// Create a new printer. |
64 | pub fn new() -> Printer { |
65 | PrinterBuilder::new().build() |
66 | } |
67 | |
68 | /// Print the given `Ast` to the given writer. The writer must implement |
69 | /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used |
70 | /// here are a `fmt::Formatter` (which is available in `fmt::Display` |
71 | /// implementations) or a `&mut String`. |
72 | pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { |
73 | visitor::visit(hir, Writer { wtr }) |
74 | } |
75 | } |
76 | |
/// Visitor state for printing: it only wraps the destination that the
/// pattern text is written to.
#[derive(Debug)]
struct Writer<W> {
    /// Destination for the printed pattern.
    wtr: W,
}
81 | |
82 | impl<W: fmt::Write> Visitor for Writer<W> { |
83 | type Output = (); |
84 | type Err = fmt::Error; |
85 | |
86 | fn finish(self) -> fmt::Result { |
87 | Ok(()) |
88 | } |
89 | |
90 | fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { |
91 | match *hir.kind() { |
92 | HirKind::Empty => { |
93 | // Technically an empty sub-expression could be "printed" by |
94 | // just ignoring it, but in practice, you could have a |
95 | // repetition operator attached to an empty expression, and you |
96 | // really need something in the concrete syntax to make that |
97 | // work as you'd expect. |
98 | self.wtr.write_str(r"(?:)" )?; |
99 | } |
100 | // Repetition operators are strictly suffix oriented. |
101 | HirKind::Repetition(_) => {} |
102 | HirKind::Literal(hir::Literal(ref bytes)) => { |
103 | // See the comment on the 'Concat' and 'Alternation' case below |
104 | // for why we put parens here. Literals are, conceptually, |
105 | // a special case of concatenation where each element is a |
106 | // character. The HIR flattens this into a Box<[u8]>, but we |
107 | // still need to treat it like a concatenation for correct |
108 | // printing. As a special case, we don't write parens if there |
109 | // is only one character. One character means there is no |
110 | // concat so we don't need parens. Adding parens would still be |
111 | // correct, but we drop them here because it tends to create |
112 | // rather noisy regexes even in simple cases. |
113 | let result = core::str::from_utf8(bytes); |
114 | let len = result.map_or(bytes.len(), |s| s.chars().count()); |
115 | if len > 1 { |
116 | self.wtr.write_str(r"(?:" )?; |
117 | } |
118 | match result { |
119 | Ok(string) => { |
120 | for c in string.chars() { |
121 | self.write_literal_char(c)?; |
122 | } |
123 | } |
124 | Err(_) => { |
125 | for &b in bytes.iter() { |
126 | self.write_literal_byte(b)?; |
127 | } |
128 | } |
129 | } |
130 | if len > 1 { |
131 | self.wtr.write_str(r")" )?; |
132 | } |
133 | } |
134 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
135 | if cls.ranges().is_empty() { |
136 | return self.wtr.write_str("[a&&b]" ); |
137 | } |
138 | self.wtr.write_str("[" )?; |
139 | for range in cls.iter() { |
140 | if range.start() == range.end() { |
141 | self.write_literal_char(range.start())?; |
142 | } else if u32::from(range.start()) + 1 |
143 | == u32::from(range.end()) |
144 | { |
145 | self.write_literal_char(range.start())?; |
146 | self.write_literal_char(range.end())?; |
147 | } else { |
148 | self.write_literal_char(range.start())?; |
149 | self.wtr.write_str("-" )?; |
150 | self.write_literal_char(range.end())?; |
151 | } |
152 | } |
153 | self.wtr.write_str("]" )?; |
154 | } |
155 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
156 | if cls.ranges().is_empty() { |
157 | return self.wtr.write_str("[a&&b]" ); |
158 | } |
159 | self.wtr.write_str("(?-u:[" )?; |
160 | for range in cls.iter() { |
161 | if range.start() == range.end() { |
162 | self.write_literal_class_byte(range.start())?; |
163 | } else if range.start() + 1 == range.end() { |
164 | self.write_literal_class_byte(range.start())?; |
165 | self.write_literal_class_byte(range.end())?; |
166 | } else { |
167 | self.write_literal_class_byte(range.start())?; |
168 | self.wtr.write_str("-" )?; |
169 | self.write_literal_class_byte(range.end())?; |
170 | } |
171 | } |
172 | self.wtr.write_str("])" )?; |
173 | } |
174 | HirKind::Look(ref look) => match *look { |
175 | hir::Look::Start => { |
176 | self.wtr.write_str(r"\A" )?; |
177 | } |
178 | hir::Look::End => { |
179 | self.wtr.write_str(r"\z" )?; |
180 | } |
181 | hir::Look::StartLF => { |
182 | self.wtr.write_str("(?m:^)" )?; |
183 | } |
184 | hir::Look::EndLF => { |
185 | self.wtr.write_str("(?m:$)" )?; |
186 | } |
187 | hir::Look::StartCRLF => { |
188 | self.wtr.write_str("(?mR:^)" )?; |
189 | } |
190 | hir::Look::EndCRLF => { |
191 | self.wtr.write_str("(?mR:$)" )?; |
192 | } |
193 | hir::Look::WordAscii => { |
194 | self.wtr.write_str(r"(?-u:\b)" )?; |
195 | } |
196 | hir::Look::WordAsciiNegate => { |
197 | self.wtr.write_str(r"(?-u:\B)" )?; |
198 | } |
199 | hir::Look::WordUnicode => { |
200 | self.wtr.write_str(r"\b" )?; |
201 | } |
202 | hir::Look::WordUnicodeNegate => { |
203 | self.wtr.write_str(r"\B" )?; |
204 | } |
205 | hir::Look::WordStartAscii => { |
206 | self.wtr.write_str(r"(?-u:\b{start})" )?; |
207 | } |
208 | hir::Look::WordEndAscii => { |
209 | self.wtr.write_str(r"(?-u:\b{end})" )?; |
210 | } |
211 | hir::Look::WordStartUnicode => { |
212 | self.wtr.write_str(r"\b{start}" )?; |
213 | } |
214 | hir::Look::WordEndUnicode => { |
215 | self.wtr.write_str(r"\b{end}" )?; |
216 | } |
217 | hir::Look::WordStartHalfAscii => { |
218 | self.wtr.write_str(r"(?-u:\b{start-half})" )?; |
219 | } |
220 | hir::Look::WordEndHalfAscii => { |
221 | self.wtr.write_str(r"(?-u:\b{end-half})" )?; |
222 | } |
223 | hir::Look::WordStartHalfUnicode => { |
224 | self.wtr.write_str(r"\b{start-half}" )?; |
225 | } |
226 | hir::Look::WordEndHalfUnicode => { |
227 | self.wtr.write_str(r"\b{end-half}" )?; |
228 | } |
229 | }, |
230 | HirKind::Capture(hir::Capture { ref name, .. }) => { |
231 | self.wtr.write_str("(" )?; |
232 | if let Some(ref name) = *name { |
233 | write!(self.wtr, "?P<{}>" , name)?; |
234 | } |
235 | } |
236 | // Why do this? Wrapping concats and alts in non-capturing groups |
237 | // is not *always* necessary, but is sometimes necessary. For |
238 | // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' |
239 | // and not 'ab|c'. The former is clearly the intended meaning, but |
240 | // the latter is actually 'alt(concat(a, b), c)'. |
241 | // |
242 | // It would be possible to only group these things in cases where |
243 | // it's strictly necessary, but it requires knowing the parent |
244 | // expression. And since this technique is simpler and always |
245 | // correct, we take this route. More to the point, it is a non-goal |
246 | // of an HIR printer to show a nice easy-to-read regex. Indeed, |
247 | // its construction forbids it from doing so. Therefore, inserting |
248 | // extra groups where they aren't necessary is perfectly okay. |
249 | HirKind::Concat(_) | HirKind::Alternation(_) => { |
250 | self.wtr.write_str(r"(?:" )?; |
251 | } |
252 | } |
253 | Ok(()) |
254 | } |
255 | |
256 | fn visit_post(&mut self, hir: &Hir) -> fmt::Result { |
257 | match *hir.kind() { |
258 | // Handled during visit_pre |
259 | HirKind::Empty |
260 | | HirKind::Literal(_) |
261 | | HirKind::Class(_) |
262 | | HirKind::Look(_) => {} |
263 | HirKind::Repetition(ref x) => { |
264 | match (x.min, x.max) { |
265 | (0, Some(1)) => { |
266 | self.wtr.write_str("?" )?; |
267 | } |
268 | (0, None) => { |
269 | self.wtr.write_str("*" )?; |
270 | } |
271 | (1, None) => { |
272 | self.wtr.write_str("+" )?; |
273 | } |
274 | (1, Some(1)) => { |
275 | // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. |
276 | return Ok(()); |
277 | } |
278 | (m, None) => { |
279 | write!(self.wtr, "{{{},}}" , m)?; |
280 | } |
281 | (m, Some(n)) if m == n => { |
282 | write!(self.wtr, "{{{}}}" , m)?; |
283 | // a{m} and a{m}? are always exactly equivalent. |
284 | return Ok(()); |
285 | } |
286 | (m, Some(n)) => { |
287 | write!(self.wtr, "{{{},{}}}" , m, n)?; |
288 | } |
289 | } |
290 | if !x.greedy { |
291 | self.wtr.write_str("?" )?; |
292 | } |
293 | } |
294 | HirKind::Capture(_) |
295 | | HirKind::Concat(_) |
296 | | HirKind::Alternation(_) => { |
297 | self.wtr.write_str(r")" )?; |
298 | } |
299 | } |
300 | Ok(()) |
301 | } |
302 | |
303 | fn visit_alternation_in(&mut self) -> fmt::Result { |
304 | self.wtr.write_str("|" ) |
305 | } |
306 | } |
307 | |
308 | impl<W: fmt::Write> Writer<W> { |
309 | fn write_literal_char(&mut self, c: char) -> fmt::Result { |
310 | if is_meta_character(c) { |
311 | self.wtr.write_str(" \\" )?; |
312 | } |
313 | self.wtr.write_char(c) |
314 | } |
315 | |
316 | fn write_literal_byte(&mut self, b: u8) -> fmt::Result { |
317 | if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { |
318 | self.write_literal_char(char::try_from(b).unwrap()) |
319 | } else { |
320 | write!(self.wtr, "(?-u: \\x{:02X})" , b) |
321 | } |
322 | } |
323 | |
324 | fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { |
325 | if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { |
326 | self.write_literal_char(char::try_from(b).unwrap()) |
327 | } else { |
328 | write!(self.wtr, " \\x{:02X}" , b) |
329 | } |
330 | } |
331 | } |
332 | |
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };

    use crate::ParserBuilder;

    use super::*;

    /// Parses `given` with the default parser configuration, prints the
    /// resulting HIR and checks that the output equals `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with the parser's `utf8` option set to `false`.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.utf8(false), given, expected);
    }

    /// Shared driver: configures a parser via `f`, parses `given`, prints the
    /// HIR, checks that the printed pattern parses again with the same
    /// configuration, and finally compares it against `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"a");
        roundtrip(r"[ab]", r"[ab]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
        roundtrip(r"[-]", r"\-");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"a");
        roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"\[");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"\[");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");

        // This tests that an empty character class is correctly roundtripped.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a");
        roundtrip("a{2}", "a{2}");
        roundtrip("a{1,}", "a+");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a");
        roundtrip("a{2}?", "a{2}");
        roundtrip("a{1,}?", "a+?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a");
        roundtrip("(?U)a{2}", "a{2}");
        roundtrip("(?U)a{1,}", "a+?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");

        // Test that various zero-length repetitions always translate to an
        // empty regex. This is more a property of HIR's smart constructors
        // than the printer though.
        roundtrip("a{0}", "(?:)");
        roundtrip("(?:ab){0}", "(?:)");
        #[cfg(feature = "unicode-gencat")]
        {
            roundtrip(r"\p{any}{0}", "(?:)");
            roundtrip(r"\P{any}{0}", "(?:)");
        }
    }

    #[test]
    fn print_group() {
        roundtrip("()", "((?:))");
        roundtrip("(?P<foo>)", "(?P<foo>(?:))");
        roundtrip("(?:)", "(?:)");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "a");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        roundtrip("|", "(?:(?:)|(?:))");
        roundtrip("||", "(?:(?:)|(?:)|(?:))");

        roundtrip("a|b", "[ab]");
        roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
        roundtrip("a|b|c", "[a-c]");
        roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
        roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
    }

    // This is a regression test that stresses a peculiarity of how the HIR
    // is both constructed and printed. Namely, it is legal for a repetition
    // to directly contain a concatenation. This particular construct isn't
    // really possible to build from the concrete syntax directly, since you'd
    // be forced to put the concatenation into (at least) a non-capturing
    // group. Concurrently, the printer doesn't consider this case and just
    // kind of naively prints the child expression and tacks on the repetition
    // operator.
    //
    // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
    // you 'ab+', but clearly it really should be '(?:ab)+'.
    //
    // This bug isn't easy to surface because most ways of building an HIR
    // come directly from the concrete syntax, and as mentioned above, it just
    // isn't possible to build this kind of HIR from the concrete syntax.
    // Nevertheless, this is definitely a bug.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("x".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::literal("ab".as_bytes())),
            }),
            Hir::literal("y".as_bytes()),
        ]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::concat(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
    }

    // Just like regression_repetition_concat, but with the repetition using
    // an alternation as a child expression instead.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::literal("cd".as_bytes()),
                    Hir::literal("ef".as_bytes()),
                ])),
            }),
            Hir::literal("gh".as_bytes()),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
    }

    // This regression test is very similar in flavor to
    // regression_repetition_concat in that the root of the issue lies in a
    // peculiarity of how the HIR is represented and how the printer writes it
    // out. Like the other regression, this one is also rooted in the fact that
    // you can't produce the peculiar HIR from the concrete syntax. Namely, you
    // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
    // be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower than concatenation), and so something like 'ab|c'
    // is actually 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::alternation(alloc::vec![
                Hir::literal("mn".as_bytes()),
                Hir::literal("xy".as_bytes()),
            ]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}
609 | |