1 | /*! |
2 | This module provides a regular expression printer for `Hir`. |
3 | */ |
4 | |
5 | use std::fmt; |
6 | |
7 | use crate::hir::visitor::{self, Visitor}; |
8 | use crate::hir::{self, Hir, HirKind}; |
9 | use crate::is_meta_character; |
10 | |
11 | /// A builder for constructing a printer. |
12 | /// |
13 | /// Note that since a printer doesn't have any configuration knobs, this type |
14 | /// remains unexported. |
15 | #[derive(Clone, Debug)] |
16 | struct PrinterBuilder { |
17 | _priv: (), |
18 | } |
19 | |
20 | impl Default for PrinterBuilder { |
21 | fn default() -> PrinterBuilder { |
22 | PrinterBuilder::new() |
23 | } |
24 | } |
25 | |
26 | impl PrinterBuilder { |
27 | fn new() -> PrinterBuilder { |
28 | PrinterBuilder { _priv: () } |
29 | } |
30 | |
31 | fn build(&self) -> Printer { |
32 | Printer { _priv: () } |
33 | } |
34 | } |
35 | |
36 | /// A printer for a regular expression's high-level intermediate |
37 | /// representation. |
38 | /// |
39 | /// A printer converts a high-level intermediate representation (HIR) to a |
40 | /// regular expression pattern string. This particular printer uses constant |
41 | /// stack space and heap space proportional to the size of the HIR. |
42 | /// |
43 | /// Since this printer is only using the HIR, the pattern it prints will likely |
44 | /// not resemble the original pattern at all. For example, a pattern like |
45 | /// `\pL` will have its entire class written out. |
46 | /// |
47 | /// The purpose of this printer is to provide a means to mutate an HIR and then |
48 | /// build a regular expression from the result of that mutation. (A regex |
49 | /// library could provide a constructor from this HIR explicitly, but that |
50 | /// creates an unnecessary public coupling between the regex library and this |
51 | /// specific HIR representation.) |
52 | #[derive(Debug)] |
53 | pub struct Printer { |
54 | _priv: (), |
55 | } |
56 | |
57 | impl Printer { |
58 | /// Create a new printer. |
59 | pub fn new() -> Printer { |
60 | PrinterBuilder::new().build() |
61 | } |
62 | |
63 | /// Print the given `Ast` to the given writer. The writer must implement |
64 | /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used |
65 | /// here are a `fmt::Formatter` (which is available in `fmt::Display` |
66 | /// implementations) or a `&mut String`. |
67 | pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { |
68 | visitor::visit(hir, Writer { wtr }) |
69 | } |
70 | } |
71 | |
72 | #[derive(Debug)] |
73 | struct Writer<W> { |
74 | wtr: W, |
75 | } |
76 | |
77 | impl<W: fmt::Write> Visitor for Writer<W> { |
78 | type Output = (); |
79 | type Err = fmt::Error; |
80 | |
81 | fn finish(self) -> fmt::Result { |
82 | Ok(()) |
83 | } |
84 | |
85 | fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { |
86 | match *hir.kind() { |
87 | HirKind::Empty |
88 | | HirKind::Repetition(_) |
89 | | HirKind::Concat(_) |
90 | | HirKind::Alternation(_) => {} |
91 | HirKind::Literal(hir::Literal::Unicode(c)) => { |
92 | self.write_literal_char(c)?; |
93 | } |
94 | HirKind::Literal(hir::Literal::Byte(b)) => { |
95 | self.write_literal_byte(b)?; |
96 | } |
97 | HirKind::Class(hir::Class::Unicode(ref cls)) => { |
98 | self.wtr.write_str("[" )?; |
99 | for range in cls.iter() { |
100 | if range.start() == range.end() { |
101 | self.write_literal_char(range.start())?; |
102 | } else { |
103 | self.write_literal_char(range.start())?; |
104 | self.wtr.write_str("-" )?; |
105 | self.write_literal_char(range.end())?; |
106 | } |
107 | } |
108 | self.wtr.write_str("]" )?; |
109 | } |
110 | HirKind::Class(hir::Class::Bytes(ref cls)) => { |
111 | self.wtr.write_str("(?-u:[" )?; |
112 | for range in cls.iter() { |
113 | if range.start() == range.end() { |
114 | self.write_literal_class_byte(range.start())?; |
115 | } else { |
116 | self.write_literal_class_byte(range.start())?; |
117 | self.wtr.write_str("-" )?; |
118 | self.write_literal_class_byte(range.end())?; |
119 | } |
120 | } |
121 | self.wtr.write_str("])" )?; |
122 | } |
123 | HirKind::Anchor(hir::Anchor::StartLine) => { |
124 | self.wtr.write_str("(?m:^)" )?; |
125 | } |
126 | HirKind::Anchor(hir::Anchor::EndLine) => { |
127 | self.wtr.write_str("(?m:$)" )?; |
128 | } |
129 | HirKind::Anchor(hir::Anchor::StartText) => { |
130 | self.wtr.write_str(r"\A" )?; |
131 | } |
132 | HirKind::Anchor(hir::Anchor::EndText) => { |
133 | self.wtr.write_str(r"\z" )?; |
134 | } |
135 | HirKind::WordBoundary(hir::WordBoundary::Unicode) => { |
136 | self.wtr.write_str(r"\b" )?; |
137 | } |
138 | HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => { |
139 | self.wtr.write_str(r"\B" )?; |
140 | } |
141 | HirKind::WordBoundary(hir::WordBoundary::Ascii) => { |
142 | self.wtr.write_str(r"(?-u:\b)" )?; |
143 | } |
144 | HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => { |
145 | self.wtr.write_str(r"(?-u:\B)" )?; |
146 | } |
147 | HirKind::Group(ref x) => match x.kind { |
148 | hir::GroupKind::CaptureIndex(_) => { |
149 | self.wtr.write_str("(" )?; |
150 | } |
151 | hir::GroupKind::CaptureName { ref name, .. } => { |
152 | write!(self.wtr, "(?P<{}>" , name)?; |
153 | } |
154 | hir::GroupKind::NonCapturing => { |
155 | self.wtr.write_str("(?:" )?; |
156 | } |
157 | }, |
158 | } |
159 | Ok(()) |
160 | } |
161 | |
162 | fn visit_post(&mut self, hir: &Hir) -> fmt::Result { |
163 | match *hir.kind() { |
164 | // Handled during visit_pre |
165 | HirKind::Empty |
166 | | HirKind::Literal(_) |
167 | | HirKind::Class(_) |
168 | | HirKind::Anchor(_) |
169 | | HirKind::WordBoundary(_) |
170 | | HirKind::Concat(_) |
171 | | HirKind::Alternation(_) => {} |
172 | HirKind::Repetition(ref x) => { |
173 | match x.kind { |
174 | hir::RepetitionKind::ZeroOrOne => { |
175 | self.wtr.write_str("?" )?; |
176 | } |
177 | hir::RepetitionKind::ZeroOrMore => { |
178 | self.wtr.write_str("*" )?; |
179 | } |
180 | hir::RepetitionKind::OneOrMore => { |
181 | self.wtr.write_str("+" )?; |
182 | } |
183 | hir::RepetitionKind::Range(ref x) => match *x { |
184 | hir::RepetitionRange::Exactly(m) => { |
185 | write!(self.wtr, "{{{}}}" , m)?; |
186 | } |
187 | hir::RepetitionRange::AtLeast(m) => { |
188 | write!(self.wtr, "{{{},}}" , m)?; |
189 | } |
190 | hir::RepetitionRange::Bounded(m, n) => { |
191 | write!(self.wtr, "{{{},{}}}" , m, n)?; |
192 | } |
193 | }, |
194 | } |
195 | if !x.greedy { |
196 | self.wtr.write_str("?" )?; |
197 | } |
198 | } |
199 | HirKind::Group(_) => { |
200 | self.wtr.write_str(")" )?; |
201 | } |
202 | } |
203 | Ok(()) |
204 | } |
205 | |
206 | fn visit_alternation_in(&mut self) -> fmt::Result { |
207 | self.wtr.write_str("|" ) |
208 | } |
209 | } |
210 | |
211 | impl<W: fmt::Write> Writer<W> { |
212 | fn write_literal_char(&mut self, c: char) -> fmt::Result { |
213 | if is_meta_character(c) { |
214 | self.wtr.write_str(" \\" )?; |
215 | } |
216 | self.wtr.write_char(c) |
217 | } |
218 | |
219 | fn write_literal_byte(&mut self, b: u8) -> fmt::Result { |
220 | let c = b as char; |
221 | if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { |
222 | self.write_literal_char(c) |
223 | } else { |
224 | write!(self.wtr, "(?-u: \\x{:02X})" , b) |
225 | } |
226 | } |
227 | |
228 | fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { |
229 | let c = b as char; |
230 | if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { |
231 | self.write_literal_char(c) |
232 | } else { |
233 | write!(self.wtr, " \\x{:02X}" , b) |
234 | } |
235 | } |
236 | } |
237 | |
#[cfg(test)]
mod tests {
    use super::Printer;
    use crate::ParserBuilder;

    /// Round-trips `given` through a default parser and the printer, and
    /// asserts that the printed pattern is exactly `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with `allow_invalid_utf8(true)` set on the
    /// parser builder.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
    }

    /// Parses `given` with a parser configured by `f`, prints the resulting
    /// HIR, checks that the printed pattern parses again with the same
    /// configuration, and finally compares it against `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        // U+00FF is not a meta character, so it is printed as-is with no
        // escape and no surrounding whitespace.
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"[a]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        // The only remaining member is U+0000, printed as the raw character.
        roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
        roundtrip(r"[-]", r"[\-]");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"(?-u:[a])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"[\[]");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a{1}");
        roundtrip("a{1,}", "a{1,}");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a{1}?");
        roundtrip("a{1,}?", "a{1,}?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a{1}?");
        roundtrip("(?U)a{1,}", "a{1,}?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");
    }

    #[test]
    fn print_group() {
        roundtrip("()", "()");
        roundtrip("(?P<foo>)", "(?P<foo>)");
        roundtrip("(?:)", "(?:)");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "(?:a)");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        roundtrip("|", "|");
        roundtrip("||", "||");

        roundtrip("a|b", "a|b");
        roundtrip("a|b|c", "a|b|c");
        roundtrip("foo|bar|quux", "foo|bar|quux");
    }
}
368 | |