print.rs source code [crates/regex-syntax/src/hir/print.rs]

/*!
This module provides a regular expression printer for `Hir`.
*/
4
5	use core::fmt;
6
7	use crate::{
8	hir::{
9	self,
10	visitor::{self, Visitor},
11	Hir, HirKind,
12	},
13	is_meta_character,
14	};
15
/// Builder that produces a `Printer`.
///
/// The printer currently has no configuration knobs, so this builder holds
/// no state and is intentionally left unexported.
#[derive(Debug, Clone)]
struct PrinterBuilder {
    _priv: (),
}
24
25	impl Default for PrinterBuilder {
26	fn default() -> PrinterBuilder {
27	PrinterBuilder::new()
28	}
29	}
30
31	impl PrinterBuilder {
32	fn new() -> PrinterBuilder {
33	PrinterBuilder { _priv: () }
34	}
35
36	fn build(&self) -> Printer {
37	Printer { _priv: () }
38	}
39	}
40
/// Prints a regular expression's high-level intermediate representation
/// (HIR) back out as a pattern string.
///
/// This particular printer uses constant stack space and heap space
/// proportional to the size of the HIR.
///
/// Because only the HIR is consulted, the printed pattern will likely look
/// nothing like the pattern the HIR was originally built from. For example,
/// a pattern like `\pL` will have its entire class written out.
///
/// The purpose of this printer is to make it possible to mutate an HIR and
/// then build a regular expression from the mutated result. (A regex library
/// could instead offer a constructor that takes this HIR directly, but that
/// would create an unnecessary public coupling between the regex library and
/// this specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    _priv: (),
}
61
62	impl Printer {
63	/// Create a new printer.
64	pub fn new() -> Printer {
65	PrinterBuilder::new().build()
66	}
67
68	/// Print the given `Ast` to the given writer. The writer must implement
69	/// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
70	/// here are a `fmt::Formatter` (which is available in `fmt::Display`
71	/// implementations) or a `&mut String`.
72	pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
73	visitor::visit(hir, visitor:Writer { wtr })
74	}
75	}
76
/// Visitor state used while printing: wraps the destination that the pattern
/// text is written to.
#[derive(Debug)]
struct Writer<W> {
    wtr: W,
}
81
82	impl<W: fmt::Write> Visitor for Writer<W> {
83	type Output = ();
84	type Err = fmt::Error;
85
86	fn finish(self) -> fmt::Result {
87	Ok(())
88	}
89
90	fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
91	match *hir.kind() {
92	HirKind::Empty => {
93	// Technically an empty sub-expression could be "printed" by
94	// just ignoring it, but in practice, you could have a
95	// repetition operator attached to an empty expression, and you
96	// really need something in the concrete syntax to make that
97	// work as you'd expect.
98	self.wtr.write_str(r"(?:)")?;
99	}
100	// Repetition operators are strictly suffix oriented.
101	HirKind::Repetition(_) => {}
102	HirKind::Literal(hir::Literal(ref bytes)) => {
103	// See the comment on the 'Concat' and 'Alternation' case below
104	// for why we put parens here. Literals are, conceptually,
105	// a special case of concatenation where each element is a
106	// character. The HIR flattens this into a Box<[u8]>, but we
107	// still need to treat it like a concatenation for correct
108	// printing. As a special case, we don't write parens if there
109	// is only one character. One character means there is no
110	// concat so we don't need parens. Adding parens would still be
111	// correct, but we drop them here because it tends to create
112	// rather noisy regexes even in simple cases.
113	let result = core::str::from_utf8(bytes);
114	let len = result.map_or(bytes.len(), \|s\| s.chars().count());
115	if len > `1` {
116	self.wtr.write_str(r"(?:")?;
117	}
118	match result {
119	Ok(string) => {
120	for c in string.chars() {
121	self.write_literal_char(c)?;
122	}
123	}
124	Err(_) => {
125	for &b in bytes.iter() {
126	self.write_literal_byte(b)?;
127	}
128	}
129	}
130	if len > `1` {
131	self.wtr.write_str(r")")?;
132	}
133	}
134	HirKind::Class(hir::Class::Unicode(ref cls)) => {
135	if cls.ranges().is_empty() {
136	return self.wtr.write_str("[a&&b]");
137	}
138	self.wtr.write_str("[")?;
139	for range in cls.iter() {
140	if range.start() == range.end() {
141	self.write_literal_char(range.start())?;
142	} else if u32::from(range.start()) + `1`
143	== u32::from(range.end())
144	{
145	self.write_literal_char(range.start())?;
146	self.write_literal_char(range.end())?;
147	} else {
148	self.write_literal_char(range.start())?;
149	self.wtr.write_str("-")?;
150	self.write_literal_char(range.end())?;
151	}
152	}
153	self.wtr.write_str("]")?;
154	}
155	HirKind::Class(hir::Class::Bytes(ref cls)) => {
156	if cls.ranges().is_empty() {
157	return self.wtr.write_str("[a&&b]");
158	}
159	self.wtr.write_str("(?-u:[")?;
160	for range in cls.iter() {
161	if range.start() == range.end() {
162	self.write_literal_class_byte(range.start())?;
163	} else if range.start() + `1` == range.end() {
164	self.write_literal_class_byte(range.start())?;
165	self.write_literal_class_byte(range.end())?;
166	} else {
167	self.write_literal_class_byte(range.start())?;
168	self.wtr.write_str("-")?;
169	self.write_literal_class_byte(range.end())?;
170	}
171	}
172	self.wtr.write_str("])")?;
173	}
174	HirKind::Look(ref look) => match *look {
175	hir::Look::Start => {
176	self.wtr.write_str(r"\A")?;
177	}
178	hir::Look::End => {
179	self.wtr.write_str(r"\z")?;
180	}
181	hir::Look::StartLF => {
182	self.wtr.write_str("(?m:^)")?;
183	}
184	hir::Look::EndLF => {
185	self.wtr.write_str("(?m:$)")?;
186	}
187	hir::Look::StartCRLF => {
188	self.wtr.write_str("(?mR:^)")?;
189	}
190	hir::Look::EndCRLF => {
191	self.wtr.write_str("(?mR:$)")?;
192	}
193	hir::Look::WordAscii => {
194	self.wtr.write_str(r"(?-u:\b)")?;
195	}
196	hir::Look::WordAsciiNegate => {
197	self.wtr.write_str(r"(?-u:\B)")?;
198	}
199	hir::Look::WordUnicode => {
200	self.wtr.write_str(r"\b")?;
201	}
202	hir::Look::WordUnicodeNegate => {
203	self.wtr.write_str(r"\B")?;
204	}
205	hir::Look::WordStartAscii => {
206	self.wtr.write_str(r"(?-u:\b{start})")?;
207	}
208	hir::Look::WordEndAscii => {
209	self.wtr.write_str(r"(?-u:\b{end})")?;
210	}
211	hir::Look::WordStartUnicode => {
212	self.wtr.write_str(r"\b{start}")?;
213	}
214	hir::Look::WordEndUnicode => {
215	self.wtr.write_str(r"\b{end}")?;
216	}
217	hir::Look::WordStartHalfAscii => {
218	self.wtr.write_str(r"(?-u:\b{start-half})")?;
219	}
220	hir::Look::WordEndHalfAscii => {
221	self.wtr.write_str(r"(?-u:\b{end-half})")?;
222	}
223	hir::Look::WordStartHalfUnicode => {
224	self.wtr.write_str(r"\b{start-half}")?;
225	}
226	hir::Look::WordEndHalfUnicode => {
227	self.wtr.write_str(r"\b{end-half}")?;
228	}
229	},
230	HirKind::Capture(hir::Capture { ref name, .. }) => {
231	self.wtr.write_str("(")?;
232	if let Some(ref name) = *name {
233	write!(self.wtr, "?P<{}>", name)?;
234	}
235	}
236	// Why do this? Wrapping concats and alts in non-capturing groups
237	// is not always* necessary, but is sometimes necessary. For*
238	// example, 'concat(a, alt(b, c))' should be written as 'a(?:b\|c)'
239	// and not 'ab\|c'. The former is clearly the intended meaning, but
240	// the latter is actually 'alt(concat(a, b), c)'.
241	//
242	// It would be possible to only group these things in cases where
243	// it's strictly necessary, but it requires knowing the parent
244	// expression. And since this technique is simpler and always
245	// correct, we take this route. More to the point, it is a non-goal
246	// of an HIR printer to show a nice easy-to-read regex. Indeed,
247	// its construction forbids it from doing so. Therefore, inserting
248	// extra groups where they aren't necessary is perfectly okay.
249	HirKind::Concat(_) \| HirKind::Alternation(_) => {
250	self.wtr.write_str(r"(?:")?;
251	}
252	}
253	Ok(())
254	}
255
256	fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
257	match *hir.kind() {
258	// Handled during visit_pre
259	HirKind::Empty
260	\| HirKind::Literal(_)
261	\| HirKind::Class(_)
262	\| HirKind::Look(_) => {}
263	HirKind::Repetition(ref x) => {
264	match (x.min, x.max) {
265	(`0`, Some(`1`)) => {
266	self.wtr.write_str("?")?;
267	}
268	(`0`, None) => {
269	self.wtr.write_str("*")?;
270	}
271	(`1`, None) => {
272	self.wtr.write_str("+")?;
273	}
274	(`1`, Some(`1`)) => {
275	// 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
276	return Ok(());
277	}
278	(m, None) => {
279	write!(self.wtr, "`{{`{},`}}`", m)?;
280	}
281	(m, Some(n)) if m == n => {
282	write!(self.wtr, "`{{`{}`}}`", m)?;
283	// a{m} and a{m}? are always exactly equivalent.
284	return Ok(());
285	}
286	(m, Some(n)) => {
287	write!(self.wtr, "`{{`{},{}`}}`", m, n)?;
288	}
289	}
290	if !x.greedy {
291	self.wtr.write_str("?")?;
292	}
293	}
294	HirKind::Capture(_)
295	\| HirKind::Concat(_)
296	\| HirKind::Alternation(_) => {
297	self.wtr.write_str(r")")?;
298	}
299	}
300	Ok(())
301	}
302
303	fn visit_alternation_in(&mut self) -> fmt::Result {
304	self.wtr.write_str("\|")
305	}
306	}
307
308	impl<W: fmt::Write> Writer<W> {
309	fn write_literal_char(&mut self, c: char) -> fmt::Result {
310	if is_meta_character(c) {
311	self.wtr.write_str("`\\`")?;
312	}
313	self.wtr.write_char(c)
314	}
315
316	fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
317	if b <= `0x7F` && !b.is_ascii_control() && !b.is_ascii_whitespace() {
318	self.write_literal_char(char::try_from(b).unwrap())
319	} else {
320	write!(self.wtr, "(?-u:`\\`x{:`02`X})", b)
321	}
322	}
323
324	fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
325	if b <= `0x7F` && !b.is_ascii_control() && !b.is_ascii_whitespace() {
326	self.write_literal_char(char::try_from(b).unwrap())
327	} else {
328	write!(self.wtr, "`\\`x{:`02`X}", b)
329	}
330	}
331	}
332
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };

    use crate::ParserBuilder;

    use super::*;

    /// Parse `given` with a default parser, print the resulting HIR and
    /// check that the output equals `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with the parser's UTF-8 mode disabled.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.utf8(false), given, expected);
    }

    /// Shared driver: `f` configures the parser builder before `given` is
    /// parsed and printed.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"a");
        roundtrip(r"[ab]", r"[ab]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
        roundtrip(r"[-]", r"\-");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"a");
        roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"\[");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"\[");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");

        // This tests that an empty character class is correctly roundtripped.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a");
        roundtrip("a{2}", "a{2}");
        roundtrip("a{1,}", "a+");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a");
        roundtrip("a{2}?", "a{2}");
        roundtrip("a{1,}?", "a+?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a");
        roundtrip("(?U)a{2}", "a{2}");
        roundtrip("(?U)a{1,}", "a+?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");

        // Test that various zero-length repetitions always translate to an
        // empty regex. This is more a property of HIR's smart constructors
        // than the printer though.
        roundtrip("a{0}", "(?:)");
        roundtrip("(?:ab){0}", "(?:)");
        #[cfg(feature = "unicode-gencat")]
        {
            roundtrip(r"\p{any}{0}", "(?:)");
            roundtrip(r"\P{any}{0}", "(?:)");
        }
    }

    #[test]
    fn print_group() {
        roundtrip("()", "((?:))");
        roundtrip("(?P<foo>)", "(?P<foo>(?:))");
        roundtrip("(?:)", "(?:)");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "a");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        roundtrip("|", "(?:(?:)|(?:))");
        roundtrip("||", "(?:(?:)|(?:)|(?:))");

        roundtrip("a|b", "[ab]");
        roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
        roundtrip("a|b|c", "[a-c]");
        roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
        roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
    }

    // This is a regression test that stresses a peculiarity of how the HIR
    // is both constructed and printed. Namely, it is legal for a repetition
    // to directly contain a concatenation. This particular construct isn't
    // really possible to build from the concrete syntax directly, since you'd
    // be forced to put the concatenation into (at least) a non-capturing
    // group. Concurrently, the printer doesn't consider this case and just
    // kind of naively prints the child expression and tacks on the repetition
    // operator.
    //
    // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
    // you 'ab+', but clearly it really should be '(?:ab)+'.
    //
    // This bug isn't easy to surface because most ways of building an HIR
    // come directly from the concrete syntax, and as mentioned above, it just
    // isn't possible to build this kind of HIR from the concrete syntax.
    // Nevertheless, this is definitely a bug.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("x".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::literal("ab".as_bytes())),
            }),
            Hir::literal("y".as_bytes()),
        ]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::concat(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
    }

    // Just like regression_repetition_concat, but with the repetition using
    // an alternation as a child expression instead.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::literal("cd".as_bytes()),
                    Hir::literal("ef".as_bytes()),
                ])),
            }),
            Hir::literal("gh".as_bytes()),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
    }

    // This regression test is very similar in flavor to
    // regression_repetition_concat in that the root of the issue lies in a
    // peculiarity of how the HIR is represented and how the printer writes it
    // out. Like the other regression, this one is also rooted in the fact that
    // you can't produce the peculiar HIR from the concrete syntax. Namely, you
    // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
    // be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower than concatenation), and so something like 'ab|c'
    // is actually 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::alternation(alloc::vec![
                Hir::literal("mn".as_bytes()),
                Hir::literal("xy".as_bytes()),
            ]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}
609