1/*!
2This module provides a regular expression printer for `Hir`.
3*/
4
5use std::fmt;
6
7use crate::hir::visitor::{self, Visitor};
8use crate::hir::{self, Hir, HirKind};
9use crate::is_meta_character;
10
/// A builder for constructing a printer.
///
/// The builder carries no settings: its only field is a private unit marker,
/// and `build` always produces the same kind of `Printer`.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Unit placeholder; there is currently nothing to configure.
    _priv: (),
}
19
20impl Default for PrinterBuilder {
21 fn default() -> PrinterBuilder {
22 PrinterBuilder::new()
23 }
24}
25
26impl PrinterBuilder {
27 fn new() -> PrinterBuilder {
28 PrinterBuilder { _priv: () }
29 }
30
31 fn build(&self) -> Printer {
32 Printer { _priv: () }
33 }
34}
35
/// A printer for a regular expression's high-level intermediate
/// representation.
///
/// A printer converts a high-level intermediate representation (HIR) to a
/// regular expression pattern string. This particular printer uses constant
/// stack space and heap space proportional to the size of the HIR.
///
/// Since this printer is only using the HIR, the pattern it prints will likely
/// not resemble the original pattern at all. For example, a pattern like
/// `\pL` will have its entire class written out.
///
/// The purpose of this printer is to provide a means to mutate an HIR and then
/// build a regular expression from the result of that mutation. (A regex
/// library could provide a constructor from this HIR explicitly, but that
/// creates an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
///
/// The printer holds no state of its own; create one with `Printer::new`.
#[derive(Debug)]
pub struct Printer {
    // Private unit field: it prevents the struct from being built with a
    // struct literal outside this module, so callers go through
    // `Printer::new`.
    _priv: (),
}
56
57impl Printer {
58 /// Create a new printer.
59 pub fn new() -> Printer {
60 PrinterBuilder::new().build()
61 }
62
63 /// Print the given `Ast` to the given writer. The writer must implement
64 /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
65 /// here are a `fmt::Formatter` (which is available in `fmt::Display`
66 /// implementations) or a `&mut String`.
67 pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
68 visitor::visit(hir, Writer { wtr })
69 }
70}
71
/// The visitor that does the actual printing.
///
/// It wraps the destination writer; the `Visitor` implementation for this
/// type writes pattern syntax into it as each HIR node is visited.
#[derive(Debug)]
struct Writer<W> {
    /// Destination that the pattern text is written to.
    wtr: W,
}
76
77impl<W: fmt::Write> Visitor for Writer<W> {
78 type Output = ();
79 type Err = fmt::Error;
80
81 fn finish(self) -> fmt::Result {
82 Ok(())
83 }
84
85 fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
86 match *hir.kind() {
87 HirKind::Empty
88 | HirKind::Repetition(_)
89 | HirKind::Concat(_)
90 | HirKind::Alternation(_) => {}
91 HirKind::Literal(hir::Literal::Unicode(c)) => {
92 self.write_literal_char(c)?;
93 }
94 HirKind::Literal(hir::Literal::Byte(b)) => {
95 self.write_literal_byte(b)?;
96 }
97 HirKind::Class(hir::Class::Unicode(ref cls)) => {
98 self.wtr.write_str("[")?;
99 for range in cls.iter() {
100 if range.start() == range.end() {
101 self.write_literal_char(range.start())?;
102 } else {
103 self.write_literal_char(range.start())?;
104 self.wtr.write_str("-")?;
105 self.write_literal_char(range.end())?;
106 }
107 }
108 self.wtr.write_str("]")?;
109 }
110 HirKind::Class(hir::Class::Bytes(ref cls)) => {
111 self.wtr.write_str("(?-u:[")?;
112 for range in cls.iter() {
113 if range.start() == range.end() {
114 self.write_literal_class_byte(range.start())?;
115 } else {
116 self.write_literal_class_byte(range.start())?;
117 self.wtr.write_str("-")?;
118 self.write_literal_class_byte(range.end())?;
119 }
120 }
121 self.wtr.write_str("])")?;
122 }
123 HirKind::Anchor(hir::Anchor::StartLine) => {
124 self.wtr.write_str("(?m:^)")?;
125 }
126 HirKind::Anchor(hir::Anchor::EndLine) => {
127 self.wtr.write_str("(?m:$)")?;
128 }
129 HirKind::Anchor(hir::Anchor::StartText) => {
130 self.wtr.write_str(r"\A")?;
131 }
132 HirKind::Anchor(hir::Anchor::EndText) => {
133 self.wtr.write_str(r"\z")?;
134 }
135 HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
136 self.wtr.write_str(r"\b")?;
137 }
138 HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
139 self.wtr.write_str(r"\B")?;
140 }
141 HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
142 self.wtr.write_str(r"(?-u:\b)")?;
143 }
144 HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
145 self.wtr.write_str(r"(?-u:\B)")?;
146 }
147 HirKind::Group(ref x) => match x.kind {
148 hir::GroupKind::CaptureIndex(_) => {
149 self.wtr.write_str("(")?;
150 }
151 hir::GroupKind::CaptureName { ref name, .. } => {
152 write!(self.wtr, "(?P<{}>", name)?;
153 }
154 hir::GroupKind::NonCapturing => {
155 self.wtr.write_str("(?:")?;
156 }
157 },
158 }
159 Ok(())
160 }
161
162 fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
163 match *hir.kind() {
164 // Handled during visit_pre
165 HirKind::Empty
166 | HirKind::Literal(_)
167 | HirKind::Class(_)
168 | HirKind::Anchor(_)
169 | HirKind::WordBoundary(_)
170 | HirKind::Concat(_)
171 | HirKind::Alternation(_) => {}
172 HirKind::Repetition(ref x) => {
173 match x.kind {
174 hir::RepetitionKind::ZeroOrOne => {
175 self.wtr.write_str("?")?;
176 }
177 hir::RepetitionKind::ZeroOrMore => {
178 self.wtr.write_str("*")?;
179 }
180 hir::RepetitionKind::OneOrMore => {
181 self.wtr.write_str("+")?;
182 }
183 hir::RepetitionKind::Range(ref x) => match *x {
184 hir::RepetitionRange::Exactly(m) => {
185 write!(self.wtr, "{{{}}}", m)?;
186 }
187 hir::RepetitionRange::AtLeast(m) => {
188 write!(self.wtr, "{{{},}}", m)?;
189 }
190 hir::RepetitionRange::Bounded(m, n) => {
191 write!(self.wtr, "{{{},{}}}", m, n)?;
192 }
193 },
194 }
195 if !x.greedy {
196 self.wtr.write_str("?")?;
197 }
198 }
199 HirKind::Group(_) => {
200 self.wtr.write_str(")")?;
201 }
202 }
203 Ok(())
204 }
205
206 fn visit_alternation_in(&mut self) -> fmt::Result {
207 self.wtr.write_str("|")
208 }
209}
210
211impl<W: fmt::Write> Writer<W> {
212 fn write_literal_char(&mut self, c: char) -> fmt::Result {
213 if is_meta_character(c) {
214 self.wtr.write_str("\\")?;
215 }
216 self.wtr.write_char(c)
217 }
218
219 fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
220 let c = b as char;
221 if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
222 self.write_literal_char(c)
223 } else {
224 write!(self.wtr, "(?-u:\\x{:02X})", b)
225 }
226 }
227
228 fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
229 let c = b as char;
230 if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
231 self.write_literal_char(c)
232 } else {
233 write!(self.wtr, "\\x{:02X}", b)
234 }
235 }
236}
237
#[cfg(test)]
mod tests {
    use super::Printer;
    use crate::ParserBuilder;

    /// Round-trip `given` with a default parser and compare the printed
    /// pattern against `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with `allow_invalid_utf8(true)` set on the
    /// parser.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
    }

    /// Run `roundtrip` over every `(given, expected)` pair, in order.
    fn roundtrip_all(cases: &[(&str, &str)]) {
        for &(given, expected) in cases {
            roundtrip(given, expected);
        }
    }

    /// Run `roundtrip_bytes` over every `(given, expected)` pair, in order.
    fn roundtrip_bytes_all(cases: &[(&str, &str)]) {
        for &(given, expected) in cases {
            roundtrip_bytes(given, expected);
        }
    }

    /// Parse `given` with a parser configured by `f`, print the resulting
    /// HIR, check that the printed pattern parses again under the same
    /// configuration, and finally compare it with `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut config = ParserBuilder::new();
        f(&mut config);
        let parsed = config.build().parse(given).unwrap();

        let mut printed = String::new();
        Printer::new().print(&parsed, &mut printed).unwrap();

        // Check that the result is actually valid.
        config.build().parse(&printed).unwrap();

        assert_eq!(expected, printed);
    }

    #[test]
    fn print_literal() {
        roundtrip_all(&[("a", "a"), (r"\xff", "\u{FF}")]);
        roundtrip_bytes_all(&[
            (r"\xff", "\u{FF}"),
            (r"(?-u)\xff", r"(?-u:\xFF)"),
        ]);
        roundtrip_all(&[("☃", "☃")]);
    }

    #[test]
    fn print_class() {
        roundtrip_all(&[
            (r"[a]", r"[a]"),
            (r"[a-z]", r"[a-z]"),
            (r"[a-z--b-c--x-y]", r"[ad-wz]"),
            (r"[^\x01-\u{10FFFF}]", "[\u{0}]"),
            (r"[-]", r"[\-]"),
            (r"[☃-⛄]", r"[☃-⛄]"),
            (r"(?-u)[a]", r"(?-u:[a])"),
            (r"(?-u)[a-z]", r"(?-u:[a-z])"),
        ]);
        roundtrip_bytes_all(&[(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])")]);

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip_all(&[
            (r"[\[]", r"[\[]"),
            (r"[Z-_]", r"[Z-_]"),
            (r"[Z-_--Z]", r"[\[-_]"),
        ]);

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes_all(&[
            (r"(?-u)[\[]", r"(?-u:[\[])"),
            (r"(?-u)[Z-_]", r"(?-u:[Z-_])"),
            (r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"),
        ]);
    }

    #[test]
    fn print_anchor() {
        roundtrip_all(&[
            (r"^", r"\A"),
            (r"$", r"\z"),
            (r"(?m)^", r"(?m:^)"),
            (r"(?m)$", r"(?m:$)"),
        ]);
    }

    #[test]
    fn print_word_boundary() {
        roundtrip_all(&[
            (r"\b", r"\b"),
            (r"\B", r"\B"),
            (r"(?-u)\b", r"(?-u:\b)"),
        ]);
        roundtrip_bytes_all(&[(r"(?-u)\B", r"(?-u:\B)")]);
    }

    #[test]
    fn print_repetition() {
        roundtrip_all(&[
            // `?`
            ("a?", "a?"),
            ("a??", "a??"),
            ("(?U)a?", "a??"),
            // `*`
            ("a*", "a*"),
            ("a*?", "a*?"),
            ("(?U)a*", "a*?"),
            // `+`
            ("a+", "a+"),
            ("a+?", "a+?"),
            ("(?U)a+", "a+?"),
            // Counted repetitions.
            ("a{1}", "a{1}"),
            ("a{1,}", "a{1,}"),
            ("a{1,5}", "a{1,5}"),
            ("a{1}?", "a{1}?"),
            ("a{1,}?", "a{1,}?"),
            ("a{1,5}?", "a{1,5}?"),
            ("(?U)a{1}", "a{1}?"),
            ("(?U)a{1,}", "a{1,}?"),
            ("(?U)a{1,5}", "a{1,5}?"),
        ]);
    }

    #[test]
    fn print_group() {
        roundtrip_all(&[
            // Empty groups.
            ("()", "()"),
            ("(?P<foo>)", "(?P<foo>)"),
            ("(?:)", "(?:)"),
            // Groups with content.
            ("(a)", "(a)"),
            ("(?P<foo>a)", "(?P<foo>a)"),
            ("(?:a)", "(?:a)"),
            // Nesting.
            ("((((a))))", "((((a))))"),
        ]);
    }

    #[test]
    fn print_alternation() {
        roundtrip_all(&[
            // Empty branches.
            ("|", "|"),
            ("||", "||"),
            // Non-empty branches.
            ("a|b", "a|b"),
            ("a|b|c", "a|b|c"),
            ("foo|bar|quux", "foo|bar|quux"),
        ]);
    }
}
368