print.rs source code [crates/regex-syntax-0.7.2/src/hir/print.rs]

/*!
This module provides a regular expression printer for `Hir`.
*/
4
5	use core::fmt;
6
7	use crate::{
8	hir::{
9	self,
10	visitor::{self, Visitor},
11	Hir, HirKind,
12	},
13	is_meta_character,
14	};
15
/// Constructs `Printer` values.
///
/// A printer currently has no configuration knobs, so this builder carries no
/// settings and is deliberately left unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Placeholder unit field; the builder holds no state.
    _priv: (),
}
24
25	impl Default for PrinterBuilder {
26	fn default() -> PrinterBuilder {
27	PrinterBuilder::new()
28	}
29	}
30
31	impl PrinterBuilder {
32	fn new() -> PrinterBuilder {
33	PrinterBuilder { _priv: () }
34	}
35
36	fn build(&self) -> Printer {
37	Printer { _priv: () }
38	}
39	}
40
/// Renders a regular expression's high-level intermediate representation
/// (HIR) as pattern text.
///
/// Given an `Hir`, the printer emits a regular expression pattern string for
/// it. It does so using constant stack space, with heap usage proportional to
/// the size of the HIR.
///
/// Only the HIR is consulted, so the output will likely look nothing like the
/// pattern the HIR was originally built from. As one example, `\pL` is printed
/// with its entire class written out.
///
/// The printer exists so that an HIR can be mutated and a regular expression
/// then built from the mutated result. (A regex library could instead accept
/// this HIR directly in a constructor, but that would create an unnecessary
/// public coupling between the regex library and this specific HIR
/// representation.)
#[derive(Debug)]
pub struct Printer {
    // Placeholder unit field; the printer holds no state.
    _priv: (),
}
61
62	impl Printer {
63	/// Create a new printer.
64	pub fn new() -> Printer {
65	PrinterBuilder::new().build()
66	}
67
68	/// Print the given `Ast` to the given writer. The writer must implement
69	/// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
70	/// here are a `fmt::Formatter` (which is available in `fmt::Display`
71	/// implementations) or a `&mut String`.
72	pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
73	visitor::visit(hir, visitor:Writer { wtr })
74	}
75	}
76
/// Visitor state used while printing: wraps the destination that the pattern
/// text is written to.
#[derive(Debug)]
struct Writer<W> {
    // Destination for the printed pattern.
    wtr: W,
}
81
82	impl<W: fmt::Write> Visitor for Writer<W> {
83	type Output = ();
84	type Err = fmt::Error;
85
86	fn finish(self) -> fmt::Result {
87	Ok(())
88	}
89
90	fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
91	match *hir.kind() {
92	// Empty is represented by nothing in the concrete syntax, and
93	// repetition operators are strictly suffix oriented.
94	HirKind::Empty \| HirKind::Repetition(_) => {}
95	HirKind::Literal(hir::Literal(ref bytes)) => {
96	// See the comment on the 'Concat' and 'Alternation' case below
97	// for why we put parens here. Literals are, conceptually,
98	// a special case of concatenation where each element is a
99	// character. The HIR flattens this into a Box<[u8]>, but we
100	// still need to treat it like a concatenation for correct
101	// printing. As a special case, we don't write parens if there
102	// is only one character. One character means there is no
103	// concat so we don't need parens. Adding parens would still be
104	// correct, but we drop them here because it tends to create
105	// rather noisy regexes even in simple cases.
106	let result = core::str::from_utf8(bytes);
107	let len = result.map_or(bytes.len(), \|s\| s.chars().count());
108	if len > `1` {
109	self.wtr.write_str(r"(?:")?;
110	}
111	match result {
112	Ok(string) => {
113	for c in string.chars() {
114	self.write_literal_char(c)?;
115	}
116	}
117	Err(_) => {
118	for &b in bytes.iter() {
119	self.write_literal_byte(b)?;
120	}
121	}
122	}
123	if len > `1` {
124	self.wtr.write_str(r")")?;
125	}
126	}
127	HirKind::Class(hir::Class::Unicode(ref cls)) => {
128	if cls.ranges().is_empty() {
129	return self.wtr.write_str("[a&&b]");
130	}
131	self.wtr.write_str("[")?;
132	for range in cls.iter() {
133	if range.start() == range.end() {
134	self.write_literal_char(range.start())?;
135	} else if u32::from(range.start()) + `1`
136	== u32::from(range.end())
137	{
138	self.write_literal_char(range.start())?;
139	self.write_literal_char(range.end())?;
140	} else {
141	self.write_literal_char(range.start())?;
142	self.wtr.write_str("-")?;
143	self.write_literal_char(range.end())?;
144	}
145	}
146	self.wtr.write_str("]")?;
147	}
148	HirKind::Class(hir::Class::Bytes(ref cls)) => {
149	if cls.ranges().is_empty() {
150	return self.wtr.write_str("[a&&b]");
151	}
152	self.wtr.write_str("(?-u:[")?;
153	for range in cls.iter() {
154	if range.start() == range.end() {
155	self.write_literal_class_byte(range.start())?;
156	} else if range.start() + `1` == range.end() {
157	self.write_literal_class_byte(range.start())?;
158	self.write_literal_class_byte(range.end())?;
159	} else {
160	self.write_literal_class_byte(range.start())?;
161	self.wtr.write_str("-")?;
162	self.write_literal_class_byte(range.end())?;
163	}
164	}
165	self.wtr.write_str("])")?;
166	}
167	HirKind::Look(ref look) => match *look {
168	hir::Look::Start => {
169	self.wtr.write_str(r"\A")?;
170	}
171	hir::Look::End => {
172	self.wtr.write_str(r"\z")?;
173	}
174	hir::Look::StartLF => {
175	self.wtr.write_str("(?m:^)")?;
176	}
177	hir::Look::EndLF => {
178	self.wtr.write_str("(?m:$)")?;
179	}
180	hir::Look::StartCRLF => {
181	self.wtr.write_str("(?mR:^)")?;
182	}
183	hir::Look::EndCRLF => {
184	self.wtr.write_str("(?mR:$)")?;
185	}
186	hir::Look::WordAscii => {
187	self.wtr.write_str(r"(?-u:\b)")?;
188	}
189	hir::Look::WordAsciiNegate => {
190	self.wtr.write_str(r"(?-u:\B)")?;
191	}
192	hir::Look::WordUnicode => {
193	self.wtr.write_str(r"\b")?;
194	}
195	hir::Look::WordUnicodeNegate => {
196	self.wtr.write_str(r"\B")?;
197	}
198	},
199	HirKind::Capture(hir::Capture { ref name, .. }) => {
200	self.wtr.write_str("(")?;
201	if let Some(ref name) = *name {
202	write!(self.wtr, "?P<{}>", name)?;
203	}
204	}
205	// Why do this? Wrapping concats and alts in non-capturing groups
206	// is not always* necessary, but is sometimes necessary. For*
207	// example, 'concat(a, alt(b, c))' should be written as 'a(?:b\|c)'
208	// and not 'ab\|c'. The former is clearly the intended meaning, but
209	// the latter is actually 'alt(concat(a, b), c)'.
210	//
211	// It would be possible to only group these things in cases where
212	// it's strictly necessary, but it requires knowing the parent
213	// expression. And since this technique is simpler and always
214	// correct, we take this route. More to the point, it is a non-goal
215	// of an HIR printer to show a nice easy-to-read regex. Indeed,
216	// its construction forbids it from doing so. Therefore, inserting
217	// extra groups where they aren't necessary is perfectly okay.
218	HirKind::Concat(_) \| HirKind::Alternation(_) => {
219	self.wtr.write_str(r"(?:")?;
220	}
221	}
222	Ok(())
223	}
224
225	fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
226	match *hir.kind() {
227	// Handled during visit_pre
228	HirKind::Empty
229	\| HirKind::Literal(_)
230	\| HirKind::Class(_)
231	\| HirKind::Look(_) => {}
232	HirKind::Repetition(ref x) => {
233	match (x.min, x.max) {
234	(`0`, Some(`1`)) => {
235	self.wtr.write_str("?")?;
236	}
237	(`0`, None) => {
238	self.wtr.write_str("*")?;
239	}
240	(`1`, None) => {
241	self.wtr.write_str("+")?;
242	}
243	(`1`, Some(`1`)) => {
244	// 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
245	return Ok(());
246	}
247	(m, None) => {
248	write!(self.wtr, "`{{`{},`}}`", m)?;
249	}
250	(m, Some(n)) if m == n => {
251	write!(self.wtr, "`{{`{}`}}`", m)?;
252	// a{m} and a{m}? are always exactly equivalent.
253	return Ok(());
254	}
255	(m, Some(n)) => {
256	write!(self.wtr, "`{{`{},{}`}}`", m, n)?;
257	}
258	}
259	if !x.greedy {
260	self.wtr.write_str("?")?;
261	}
262	}
263	HirKind::Capture(_)
264	\| HirKind::Concat(_)
265	\| HirKind::Alternation(_) => {
266	self.wtr.write_str(r")")?;
267	}
268	}
269	Ok(())
270	}
271
272	fn visit_alternation_in(&mut self) -> fmt::Result {
273	self.wtr.write_str("\|")
274	}
275	}
276
277	impl<W: fmt::Write> Writer<W> {
278	fn write_literal_char(&mut self, c: char) -> fmt::Result {
279	if is_meta_character(c) {
280	self.wtr.write_str("`\\`")?;
281	}
282	self.wtr.write_char(c)
283	}
284
285	fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
286	if b <= `0x7F` && !b.is_ascii_control() && !b.is_ascii_whitespace() {
287	self.write_literal_char(char::try_from(b).unwrap())
288	} else {
289	write!(self.wtr, "(?-u:`\\`x{:`02`X})", b)
290	}
291	}
292
293	fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
294	if b <= `0x7F` && !b.is_ascii_control() && !b.is_ascii_whitespace() {
295	self.write_literal_char(char::try_from(b).unwrap())
296	} else {
297	write!(self.wtr, "`\\`x{:`02`X}", b)
298	}
299	}
300	}
301
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };

    use crate::ParserBuilder;

    use super::*;

    /// Parses `given` with a default parser, prints the resulting HIR and
    /// asserts that the printed pattern equals `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with the parser's `utf8` option turned off.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.utf8(false), given, expected);
    }

    /// Shared driver: configures a parser builder via `f`, parses `given`,
    /// prints the HIR, checks that the printed pattern parses again with the
    /// same configuration and finally compares it against `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"a");
        roundtrip(r"[ab]", r"[ab]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
        roundtrip(r"[-]", r"\-");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"a");
        roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"\[");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"\[");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");

        // This tests that an empty character class is correctly roundtripped.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a");
        roundtrip("a{2}", "a{2}");
        roundtrip("a{1,}", "a+");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a");
        roundtrip("a{2}?", "a{2}");
        roundtrip("a{1,}?", "a+?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a");
        roundtrip("(?U)a{2}", "a{2}");
        roundtrip("(?U)a{1,}", "a+?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");

        // Test that various zero-length repetitions always translate to an
        // empty regex. This is more a property of HIR's smart constructors
        // than the printer though.
        roundtrip("a{0}", "");
        roundtrip("(?:ab){0}", "");
        #[cfg(feature = "unicode-gencat")]
        {
            roundtrip(r"\p{any}{0}", "");
            roundtrip(r"\P{any}{0}", "");
        }
    }

    #[test]
    fn print_group() {
        roundtrip("()", "()");
        roundtrip("(?P<foo>)", "(?P<foo>)");
        roundtrip("(?:)", "");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "a");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        roundtrip("|", "(?:|)");
        roundtrip("||", "(?:||)");

        roundtrip("a|b", "[ab]");
        roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
        roundtrip("a|b|c", "[a-c]");
        roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
        roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
    }

    // This is a regression test that stresses a peculiarity of how the HIR
    // is both constructed and printed. Namely, it is legal for a repetition
    // to directly contain a concatenation. This particular construct isn't
    // really possible to build from the concrete syntax directly, since you'd
    // be forced to put the concatenation into (at least) a non-capturing
    // group. Concurrently, the printer doesn't consider this case and just
    // kind of naively prints the child expression and tacks on the repetition
    // operator.
    //
    // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
    // you 'ab+', but clearly it really should be '(?:ab)+'.
    //
    // This bug isn't easy to surface because most ways of building an HIR
    // come directly from the concrete syntax, and as mentioned above, it just
    // isn't possible to build this kind of HIR from the concrete syntax.
    // Nevertheless, this is definitely a bug.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("x".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::literal("ab".as_bytes())),
            }),
            Hir::literal("y".as_bytes()),
        ]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::concat(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string());
    }

    // Just like regression_repetition_concat, but with the repetition using
    // an alternation as a child expression instead.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::literal("cd".as_bytes()),
                    Hir::literal("ef".as_bytes()),
                ])),
            }),
            Hir::literal("gh".as_bytes()),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string());
    }

    // This regression test is very similar in flavor to
    // regression_repetition_concat in that the root of the issue lies in a
    // peculiarity of how the HIR is represented and how the printer writes it
    // out. Like the other regression, this one is also rooted in the fact that
    // you can't produce the peculiar HIR from the concrete syntax. Namely, you
    // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
    // be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower than concatenation), and so something like 'ab|c'
    // is actually 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::alternation(alloc::vec![
                Hir::literal("mn".as_bytes()),
                Hir::literal("xy".as_bytes()),
            ]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}
578