| 1 | /*! | 
| 2 | This module provides a regular expression printer for `Hir`. | 
|---|
| 3 | */ | 
|---|
| 4 |  | 
|---|
| 5 | use core::fmt; | 
|---|
| 6 |  | 
|---|
| 7 | use crate::{ | 
|---|
| 8 | hir::{ | 
|---|
| 9 | self, | 
|---|
| 10 | visitor::{self, Visitor}, | 
|---|
| 11 | Hir, HirKind, | 
|---|
| 12 | }, | 
|---|
| 13 | is_meta_character, | 
|---|
| 14 | }; | 
|---|
| 15 |  | 
|---|
/// Builder that produces [`Printer`] values.
///
/// The printer currently has no configuration knobs, so this builder is kept
/// private to the module instead of being exported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    _priv: (),
}
|---|
| 24 |  | 
|---|
| 25 | impl Default for PrinterBuilder { | 
|---|
| 26 | fn default() -> PrinterBuilder { | 
|---|
| 27 | PrinterBuilder::new() | 
|---|
| 28 | } | 
|---|
| 29 | } | 
|---|
| 30 |  | 
|---|
| 31 | impl PrinterBuilder { | 
|---|
| 32 | fn new() -> PrinterBuilder { | 
|---|
| 33 | PrinterBuilder { _priv: () } | 
|---|
| 34 | } | 
|---|
| 35 |  | 
|---|
| 36 | fn build(&self) -> Printer { | 
|---|
| 37 | Printer { _priv: () } | 
|---|
| 38 | } | 
|---|
| 39 | } | 
|---|
| 40 |  | 
|---|
/// Prints a regular expression's high-level intermediate representation
/// (HIR) as a regular expression pattern string.
///
/// This printer runs in constant stack space and uses heap space
/// proportional to the size of the HIR.
///
/// Because only the HIR is consulted, the printed pattern will likely look
/// nothing like the pattern the HIR was originally built from. For example,
/// a pattern like `\pL` is printed with its entire class written out.
///
/// The intended use is to mutate an HIR and then build a regular expression
/// from the result of that mutation. (A regex library could instead offer a
/// constructor that accepts this HIR directly, but doing so would create an
/// unnecessary public coupling between the regex library and this specific
/// HIR representation.)
#[derive(Debug)]
pub struct Printer {
    _priv: (),
}
|---|
| 61 |  | 
|---|
| 62 | impl Printer { | 
|---|
| 63 | /// Create a new printer. | 
|---|
| 64 | pub fn new() -> Printer { | 
|---|
| 65 | PrinterBuilder::new().build() | 
|---|
| 66 | } | 
|---|
| 67 |  | 
|---|
| 68 | /// Print the given `Ast` to the given writer. The writer must implement | 
|---|
| 69 | /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used | 
|---|
| 70 | /// here are a `fmt::Formatter` (which is available in `fmt::Display` | 
|---|
| 71 | /// implementations) or a `&mut String`. | 
|---|
| 72 | pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result { | 
|---|
| 73 | visitor::visit(hir, visitor:Writer { wtr }) | 
|---|
| 74 | } | 
|---|
| 75 | } | 
|---|
| 76 |  | 
|---|
/// Visitor state used while printing: wraps the destination that receives
/// the pattern text.
#[derive(Debug)]
struct Writer<W> {
    wtr: W,
}
|---|
| 81 |  | 
|---|
| 82 | impl<W: fmt::Write> Visitor for Writer<W> { | 
|---|
| 83 | type Output = (); | 
|---|
| 84 | type Err = fmt::Error; | 
|---|
| 85 |  | 
|---|
| 86 | fn finish(self) -> fmt::Result { | 
|---|
| 87 | Ok(()) | 
|---|
| 88 | } | 
|---|
| 89 |  | 
|---|
| 90 | fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { | 
|---|
| 91 | match *hir.kind() { | 
|---|
| 92 | HirKind::Empty => { | 
|---|
| 93 | // Technically an empty sub-expression could be "printed" by | 
|---|
| 94 | // just ignoring it, but in practice, you could have a | 
|---|
| 95 | // repetition operator attached to an empty expression, and you | 
|---|
| 96 | // really need something in the concrete syntax to make that | 
|---|
| 97 | // work as you'd expect. | 
|---|
| 98 | self.wtr.write_str( r"(?:)")?; | 
|---|
| 99 | } | 
|---|
| 100 | // Repetition operators are strictly suffix oriented. | 
|---|
| 101 | HirKind::Repetition(_) => {} | 
|---|
| 102 | HirKind::Literal(hir::Literal(ref bytes)) => { | 
|---|
| 103 | // See the comment on the 'Concat' and 'Alternation' case below | 
|---|
| 104 | // for why we put parens here. Literals are, conceptually, | 
|---|
| 105 | // a special case of concatenation where each element is a | 
|---|
| 106 | // character. The HIR flattens this into a Box<[u8]>, but we | 
|---|
| 107 | // still need to treat it like a concatenation for correct | 
|---|
| 108 | // printing. As a special case, we don't write parens if there | 
|---|
| 109 | // is only one character. One character means there is no | 
|---|
| 110 | // concat so we don't need parens. Adding parens would still be | 
|---|
| 111 | // correct, but we drop them here because it tends to create | 
|---|
| 112 | // rather noisy regexes even in simple cases. | 
|---|
| 113 | let result = core::str::from_utf8(bytes); | 
|---|
| 114 | let len = result.map_or(bytes.len(), |s| s.chars().count()); | 
|---|
| 115 | if len > 1 { | 
|---|
| 116 | self.wtr.write_str( r"(?:")?; | 
|---|
| 117 | } | 
|---|
| 118 | match result { | 
|---|
| 119 | Ok(string) => { | 
|---|
| 120 | for c in string.chars() { | 
|---|
| 121 | self.write_literal_char(c)?; | 
|---|
| 122 | } | 
|---|
| 123 | } | 
|---|
| 124 | Err(_) => { | 
|---|
| 125 | for &b in bytes.iter() { | 
|---|
| 126 | self.write_literal_byte(b)?; | 
|---|
| 127 | } | 
|---|
| 128 | } | 
|---|
| 129 | } | 
|---|
| 130 | if len > 1 { | 
|---|
| 131 | self.wtr.write_str( r")")?; | 
|---|
| 132 | } | 
|---|
| 133 | } | 
|---|
| 134 | HirKind::Class(hir::Class::Unicode(ref cls)) => { | 
|---|
| 135 | if cls.ranges().is_empty() { | 
|---|
| 136 | return self.wtr.write_str( "[a&&b]"); | 
|---|
| 137 | } | 
|---|
| 138 | self.wtr.write_str( "[")?; | 
|---|
| 139 | for range in cls.iter() { | 
|---|
| 140 | if range.start() == range.end() { | 
|---|
| 141 | self.write_literal_char(range.start())?; | 
|---|
| 142 | } else if u32::from(range.start()) + 1 | 
|---|
| 143 | == u32::from(range.end()) | 
|---|
| 144 | { | 
|---|
| 145 | self.write_literal_char(range.start())?; | 
|---|
| 146 | self.write_literal_char(range.end())?; | 
|---|
| 147 | } else { | 
|---|
| 148 | self.write_literal_char(range.start())?; | 
|---|
| 149 | self.wtr.write_str( "-")?; | 
|---|
| 150 | self.write_literal_char(range.end())?; | 
|---|
| 151 | } | 
|---|
| 152 | } | 
|---|
| 153 | self.wtr.write_str( "]")?; | 
|---|
| 154 | } | 
|---|
| 155 | HirKind::Class(hir::Class::Bytes(ref cls)) => { | 
|---|
| 156 | if cls.ranges().is_empty() { | 
|---|
| 157 | return self.wtr.write_str( "[a&&b]"); | 
|---|
| 158 | } | 
|---|
| 159 | self.wtr.write_str( "(?-u:[")?; | 
|---|
| 160 | for range in cls.iter() { | 
|---|
| 161 | if range.start() == range.end() { | 
|---|
| 162 | self.write_literal_class_byte(range.start())?; | 
|---|
| 163 | } else if range.start() + 1 == range.end() { | 
|---|
| 164 | self.write_literal_class_byte(range.start())?; | 
|---|
| 165 | self.write_literal_class_byte(range.end())?; | 
|---|
| 166 | } else { | 
|---|
| 167 | self.write_literal_class_byte(range.start())?; | 
|---|
| 168 | self.wtr.write_str( "-")?; | 
|---|
| 169 | self.write_literal_class_byte(range.end())?; | 
|---|
| 170 | } | 
|---|
| 171 | } | 
|---|
| 172 | self.wtr.write_str( "])")?; | 
|---|
| 173 | } | 
|---|
| 174 | HirKind::Look(ref look) => match *look { | 
|---|
| 175 | hir::Look::Start => { | 
|---|
| 176 | self.wtr.write_str( r"\A")?; | 
|---|
| 177 | } | 
|---|
| 178 | hir::Look::End => { | 
|---|
| 179 | self.wtr.write_str( r"\z")?; | 
|---|
| 180 | } | 
|---|
| 181 | hir::Look::StartLF => { | 
|---|
| 182 | self.wtr.write_str( "(?m:^)")?; | 
|---|
| 183 | } | 
|---|
| 184 | hir::Look::EndLF => { | 
|---|
| 185 | self.wtr.write_str( "(?m:$)")?; | 
|---|
| 186 | } | 
|---|
| 187 | hir::Look::StartCRLF => { | 
|---|
| 188 | self.wtr.write_str( "(?mR:^)")?; | 
|---|
| 189 | } | 
|---|
| 190 | hir::Look::EndCRLF => { | 
|---|
| 191 | self.wtr.write_str( "(?mR:$)")?; | 
|---|
| 192 | } | 
|---|
| 193 | hir::Look::WordAscii => { | 
|---|
| 194 | self.wtr.write_str( r"(?-u:\b)")?; | 
|---|
| 195 | } | 
|---|
| 196 | hir::Look::WordAsciiNegate => { | 
|---|
| 197 | self.wtr.write_str( r"(?-u:\B)")?; | 
|---|
| 198 | } | 
|---|
| 199 | hir::Look::WordUnicode => { | 
|---|
| 200 | self.wtr.write_str( r"\b")?; | 
|---|
| 201 | } | 
|---|
| 202 | hir::Look::WordUnicodeNegate => { | 
|---|
| 203 | self.wtr.write_str( r"\B")?; | 
|---|
| 204 | } | 
|---|
| 205 | hir::Look::WordStartAscii => { | 
|---|
| 206 | self.wtr.write_str( r"(?-u:\b{start})")?; | 
|---|
| 207 | } | 
|---|
| 208 | hir::Look::WordEndAscii => { | 
|---|
| 209 | self.wtr.write_str( r"(?-u:\b{end})")?; | 
|---|
| 210 | } | 
|---|
| 211 | hir::Look::WordStartUnicode => { | 
|---|
| 212 | self.wtr.write_str( r"\b{start}")?; | 
|---|
| 213 | } | 
|---|
| 214 | hir::Look::WordEndUnicode => { | 
|---|
| 215 | self.wtr.write_str( r"\b{end}")?; | 
|---|
| 216 | } | 
|---|
| 217 | hir::Look::WordStartHalfAscii => { | 
|---|
| 218 | self.wtr.write_str( r"(?-u:\b{start-half})")?; | 
|---|
| 219 | } | 
|---|
| 220 | hir::Look::WordEndHalfAscii => { | 
|---|
| 221 | self.wtr.write_str( r"(?-u:\b{end-half})")?; | 
|---|
| 222 | } | 
|---|
| 223 | hir::Look::WordStartHalfUnicode => { | 
|---|
| 224 | self.wtr.write_str( r"\b{start-half}")?; | 
|---|
| 225 | } | 
|---|
| 226 | hir::Look::WordEndHalfUnicode => { | 
|---|
| 227 | self.wtr.write_str( r"\b{end-half}")?; | 
|---|
| 228 | } | 
|---|
| 229 | }, | 
|---|
| 230 | HirKind::Capture(hir::Capture { ref name, .. }) => { | 
|---|
| 231 | self.wtr.write_str( "(")?; | 
|---|
| 232 | if let Some(ref name) = *name { | 
|---|
| 233 | write!(self.wtr, "?P<{} >", name)?; | 
|---|
| 234 | } | 
|---|
| 235 | } | 
|---|
| 236 | // Why do this? Wrapping concats and alts in non-capturing groups | 
|---|
| 237 | // is not *always* necessary, but is sometimes necessary. For | 
|---|
| 238 | // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' | 
|---|
| 239 | // and not 'ab|c'. The former is clearly the intended meaning, but | 
|---|
| 240 | // the latter is actually 'alt(concat(a, b), c)'. | 
|---|
| 241 | // | 
|---|
| 242 | // It would be possible to only group these things in cases where | 
|---|
| 243 | // it's strictly necessary, but it requires knowing the parent | 
|---|
| 244 | // expression. And since this technique is simpler and always | 
|---|
| 245 | // correct, we take this route. More to the point, it is a non-goal | 
|---|
| 246 | // of an HIR printer to show a nice easy-to-read regex. Indeed, | 
|---|
| 247 | // its construction forbids it from doing so. Therefore, inserting | 
|---|
| 248 | // extra groups where they aren't necessary is perfectly okay. | 
|---|
| 249 | HirKind::Concat(_) | HirKind::Alternation(_) => { | 
|---|
| 250 | self.wtr.write_str( r"(?:")?; | 
|---|
| 251 | } | 
|---|
| 252 | } | 
|---|
| 253 | Ok(()) | 
|---|
| 254 | } | 
|---|
| 255 |  | 
|---|
| 256 | fn visit_post(&mut self, hir: &Hir) -> fmt::Result { | 
|---|
| 257 | match *hir.kind() { | 
|---|
| 258 | // Handled during visit_pre | 
|---|
| 259 | HirKind::Empty | 
|---|
| 260 | | HirKind::Literal(_) | 
|---|
| 261 | | HirKind::Class(_) | 
|---|
| 262 | | HirKind::Look(_) => {} | 
|---|
| 263 | HirKind::Repetition(ref x) => { | 
|---|
| 264 | match (x.min, x.max) { | 
|---|
| 265 | (0, Some(1)) => { | 
|---|
| 266 | self.wtr.write_str( "?")?; | 
|---|
| 267 | } | 
|---|
| 268 | (0, None) => { | 
|---|
| 269 | self.wtr.write_str( "*")?; | 
|---|
| 270 | } | 
|---|
| 271 | (1, None) => { | 
|---|
| 272 | self.wtr.write_str( "+")?; | 
|---|
| 273 | } | 
|---|
| 274 | (1, Some(1)) => { | 
|---|
| 275 | // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. | 
|---|
| 276 | return Ok(()); | 
|---|
| 277 | } | 
|---|
| 278 | (m, None) => { | 
|---|
| 279 | write!(self.wtr, "{{{} ,}} ", m)?; | 
|---|
| 280 | } | 
|---|
| 281 | (m, Some(n)) if m == n => { | 
|---|
| 282 | write!(self.wtr, "{{{}}} ", m)?; | 
|---|
| 283 | // a{m} and a{m}? are always exactly equivalent. | 
|---|
| 284 | return Ok(()); | 
|---|
| 285 | } | 
|---|
| 286 | (m, Some(n)) => { | 
|---|
| 287 | write!(self.wtr, "{{{} ,{}}} ", m, n)?; | 
|---|
| 288 | } | 
|---|
| 289 | } | 
|---|
| 290 | if !x.greedy { | 
|---|
| 291 | self.wtr.write_str( "?")?; | 
|---|
| 292 | } | 
|---|
| 293 | } | 
|---|
| 294 | HirKind::Capture(_) | 
|---|
| 295 | | HirKind::Concat(_) | 
|---|
| 296 | | HirKind::Alternation(_) => { | 
|---|
| 297 | self.wtr.write_str( r")")?; | 
|---|
| 298 | } | 
|---|
| 299 | } | 
|---|
| 300 | Ok(()) | 
|---|
| 301 | } | 
|---|
| 302 |  | 
|---|
| 303 | fn visit_alternation_in(&mut self) -> fmt::Result { | 
|---|
| 304 | self.wtr.write_str( "|") | 
|---|
| 305 | } | 
|---|
| 306 | } | 
|---|
| 307 |  | 
|---|
| 308 | impl<W: fmt::Write> Writer<W> { | 
|---|
| 309 | fn write_literal_char(&mut self, c: char) -> fmt::Result { | 
|---|
| 310 | if is_meta_character(c) { | 
|---|
| 311 | self.wtr.write_str( "\\ ")?; | 
|---|
| 312 | } | 
|---|
| 313 | self.wtr.write_char(c) | 
|---|
| 314 | } | 
|---|
| 315 |  | 
|---|
| 316 | fn write_literal_byte(&mut self, b: u8) -> fmt::Result { | 
|---|
| 317 | if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { | 
|---|
| 318 | self.write_literal_char(char::try_from(b).unwrap()) | 
|---|
| 319 | } else { | 
|---|
| 320 | write!(self.wtr, "(?-u:\\ x{:02X} )", b) | 
|---|
| 321 | } | 
|---|
| 322 | } | 
|---|
| 323 |  | 
|---|
| 324 | fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { | 
|---|
| 325 | if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { | 
|---|
| 326 | self.write_literal_char(char::try_from(b).unwrap()) | 
|---|
| 327 | } else { | 
|---|
| 328 | write!(self.wtr, "\\ x{:02X} ", b) | 
|---|
| 329 | } | 
|---|
| 330 | } | 
|---|
| 331 | } | 
|---|
| 332 |  | 
|---|
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };

    use crate::ParserBuilder;

    use super::*;

    /// Parses `given` with a default parser, prints the resulting HIR and
    /// checks that the output equals `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with the parser's `utf8` option disabled.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.utf8(false), given, expected);
    }

    /// Parses `given` with a parser configured by `f`, prints the HIR,
    /// checks that the printed pattern parses again with the same
    /// configuration and finally compares it against `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"a");
        roundtrip(r"[ab]", r"[ab]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
        roundtrip(r"[-]", r"\-");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"a");
        roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"\[");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"\[");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");

        // This tests that an empty character class is correctly roundtripped.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a");
        roundtrip("a{2}", "a{2}");
        roundtrip("a{1,}", "a+");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a");
        roundtrip("a{2}?", "a{2}");
        roundtrip("a{1,}?", "a+?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a");
        roundtrip("(?U)a{2}", "a{2}");
        roundtrip("(?U)a{1,}", "a+?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");

        // Test that various zero-length repetitions always translate to an
        // empty regex. This is more a property of HIR's smart constructors
        // than the printer though.
        roundtrip("a{0}", "(?:)");
        roundtrip("(?:ab){0}", "(?:)");
        #[cfg(feature = "unicode-gencat")]
        {
            roundtrip(r"\p{any}{0}", "(?:)");
            roundtrip(r"\P{any}{0}", "(?:)");
        }
    }

    #[test]
    fn print_group() {
        roundtrip("()", "((?:))");
        roundtrip("(?P<foo>)", "(?P<foo>(?:))");
        roundtrip("(?:)", "(?:)");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "a");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        roundtrip("|", "(?:(?:)|(?:))");
        roundtrip("||", "(?:(?:)|(?:)|(?:))");

        roundtrip("a|b", "[ab]");
        roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
        roundtrip("a|b|c", "[a-c]");
        roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
        roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
    }

    // This is a regression test that stresses a peculiarity of how the HIR
    // is both constructed and printed. Namely, it is legal for a repetition
    // to directly contain a concatenation. This particular construct isn't
    // really possible to build from the concrete syntax directly, since you'd
    // be forced to put the concatenation into (at least) a non-capturing
    // group. Concurrently, the printer doesn't consider this case and just
    // kind of naively prints the child expression and tacks on the repetition
    // operator.
    //
    // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
    // you 'ab+', but clearly it really should be '(?:ab)+'.
    //
    // This bug isn't easy to surface because most ways of building an HIR
    // come directly from the concrete syntax, and as mentioned above, it just
    // isn't possible to build this kind of HIR from the concrete syntax.
    // Nevertheless, this is definitely a bug.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("x".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::literal("ab".as_bytes())),
            }),
            Hir::literal("y".as_bytes()),
        ]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::concat(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
    }

    // Just like regression_repetition_concat, but with the repetition using
    // an alternation as a child expression instead.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::literal("cd".as_bytes()),
                    Hir::literal("ef".as_bytes()),
                ])),
            }),
            Hir::literal("gh".as_bytes()),
        ]);
        assert_eq!(
            r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))",
            expr.to_string()
        );

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
    }

    // This regression test is very similar in flavor to
    // regression_repetition_concat in that the root of the issue lies in a
    // peculiarity of how the HIR is represented and how the printer writes it
    // out. Like the other regression, this one is also rooted in the fact that
    // you can't produce the peculiar HIR from the concrete syntax. Namely, you
    // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
    // be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower than concatenation), and so something like 'ab|c'
    // is actually 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::alternation(alloc::vec![
                Hir::literal("mn".as_bytes()),
                Hir::literal("xy".as_bytes()),
            ]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}
|---|
| 609 |  | 
|---|