| 1 | /*! |
| 2 | Provides routines for interpolating capture group references. |
| 3 | |
| 4 | That is, if a replacement string contains references like `$foo` or `${foo1}`, |
| 5 | then they are replaced with the corresponding capture values for the groups |
| 6 | named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` |
| 7 | is supported as well, with `1` corresponding to a capture group index and not |
| 8 | a name. |
| 9 | |
| 10 | This module provides the free function [`string`], which interpolates Rust |
| 11 | Unicode strings. A `bytes` counterpart for byte strings is currently commented out. |
| 12 | |
| 13 | # Format |
| 14 | |
| 15 | These routines support two different kinds of capture references: unbraced and |
| 16 | braced. |
| 17 | |
| 18 | For the unbraced format, the format supported is `$ref` where `ref` can be |
| 19 | any sequence of characters in the class `[0-9A-Za-z_]`. `ref` is always the longest |
| 20 | possible parse. So for example, `$1a` corresponds to the capture group named |
| 21 | `1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then |
| 22 | it is treated as a capture group index itself and not a name. |
| 23 | |
| 24 | For the braced format, the format supported is `${ref}` where `ref` can be any |
| 25 | sequence of bytes except for `}`. If no closing brace occurs, then it is not |
| 26 | considered a capture reference. As with the unbraced format, if `ref` matches |
| 27 | `^[0-9]+$`, then it is treated as a capture group index and not a name. |
| 28 | |
| 29 | The braced format is useful for exerting precise control over the name of the |
| 30 | capture reference. For example, `${1}a` corresponds to the capture group |
| 31 | reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above) |
| 32 | corresponds to the capture group reference `1a`. The braced format is also |
| 33 | useful for expressing capture group names that use characters not supported by |
| 34 | the unbraced format. For example, `${foo[bar].baz}` refers to the capture group |
| 35 | named `foo[bar].baz`. |
| 36 | |
| 37 | If a capture group reference is found and it does not refer to a valid capture |
| 38 | group, then it will be replaced with the empty string. |
| 39 | |
| 40 | To write a literal `$`, use `$$`. |
| 41 | |
| 42 | To be clear, and as exhibited via the type signatures in the routines in this |
| 43 | module, it is impossible for a replacement string to be invalid. A replacement |
| 44 | string may not have the intended semantics, but the interpolation procedure |
| 45 | itself can never fail. |
| 46 | */ |
| 47 | |
| 48 | use alloc::string::String; |
| 49 | |
| 50 | /// Accepts a replacement string and interpolates capture references with their |
| 51 | /// corresponding values. |
| 52 | /// |
| 53 | /// `append` should be a function that appends the string value of a capture |
| 54 | /// group at a particular index to the string given. If the capture group |
| 55 | /// index is invalid, then nothing should be appended. |
| 56 | /// |
| 57 | /// `name_to_index` should be a function that maps a capture group name to a |
| 58 | /// capture group index. If the given name doesn't exist, then `None` should |
| 59 | /// be returned. |
| 60 | /// |
| 61 | /// Finally, `dst` is where the final interpolated contents should be written. |
| 62 | /// If `replacement` contains no capture group references, then `dst` will be |
| 63 | /// equivalent to `replacement`. |
| 64 | /// |
| 65 | /// See the [module documentation](self) for details about the format |
| 66 | /// supported. |
| 67 | pub fn string( |
| 68 | mut replacement: &str, |
| 69 | mut append: impl FnMut(usize, &mut String), |
| 70 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
| 71 | dst: &mut String, |
| 72 | ) { |
| 73 | while !replacement.is_empty() { |
| 74 | match replacement.find('$' ) { |
| 75 | None => break, |
| 76 | Some(i) => { |
| 77 | dst.push_str(&replacement[..i]); |
| 78 | replacement = &replacement[i..]; |
| 79 | } |
| 80 | } |
| 81 | // Handle escaping of '$'. |
| 82 | if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$' ) { |
| 83 | dst.push_str("$" ); |
| 84 | replacement = &replacement[2..]; |
| 85 | continue; |
| 86 | } |
| 87 | debug_assert!(!replacement.is_empty()); |
| 88 | let cap_ref = match find_cap_ref(replacement.as_bytes()) { |
| 89 | Some(cap_ref) => cap_ref, |
| 90 | None => { |
| 91 | dst.push_str("$" ); |
| 92 | replacement = &replacement[1..]; |
| 93 | continue; |
| 94 | } |
| 95 | }; |
| 96 | replacement = &replacement[cap_ref.end..]; |
| 97 | match cap_ref.cap { |
| 98 | Ref::Number(i) => append(i, dst), |
| 99 | Ref::Named(name) => { |
| 100 | if let Some(i) = name_to_index(name) { |
| 101 | append(i, dst); |
| 102 | } |
| 103 | } |
| 104 | } |
| 105 | } |
| 106 | dst.push_str(replacement); |
| 107 | } |
| 108 | |
| 109 | /* |
| 110 | This should be uncommented and used if we ever provide public APIs for |
| 111 | searching `&[u8]`. |
| 112 | |
| 113 | /// Accepts a replacement byte string and interpolates capture references with |
| 114 | /// their corresponding values. |
| 115 | /// |
| 116 | /// `append` should be a function that appends the byte string value of a |
| 117 | /// capture group at a particular index to the byte string given. If the |
| 118 | /// capture group index is invalid, then nothing should be appended. |
| 119 | /// |
| 120 | /// `name_to_index` should be a function that maps a capture group name to a |
| 121 | /// capture group index. If the given name doesn't exist, then `None` should |
| 122 | /// be returned. |
| 123 | /// |
| 124 | /// Finally, `dst` is where the final interpolated contents should be written. |
| 125 | /// If `replacement` contains no capture group references, then `dst` will be |
| 126 | /// equivalent to `replacement`. |
| 127 | /// |
| 128 | /// See the [module documentation](self) for details about the format |
| 129 | /// supported. |
| 130 | pub fn bytes( |
| 131 | mut replacement: &[u8], |
| 132 | mut append: impl FnMut(usize, &mut Vec<u8>), |
| 133 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
| 134 | dst: &mut Vec<u8>, |
| 135 | ) { |
| 136 | while !replacement.is_empty() { |
| 137 | match replacement.iter().position(|&b| b == b'$') { |
| 138 | None => break, |
| 139 | Some(i) => { |
| 140 | dst.extend_from_slice(&replacement[..i]); |
| 141 | replacement = &replacement[i..]; |
| 142 | } |
| 143 | } |
| 144 | // Handle escaping of '$'. |
| 145 | if replacement.get(1).map_or(false, |&b| b == b'$') { |
| 146 | dst.push(b'$'); |
| 147 | replacement = &replacement[2..]; |
| 148 | continue; |
| 149 | } |
| 150 | debug_assert!(!replacement.is_empty()); |
| 151 | let cap_ref = match find_cap_ref(replacement) { |
| 152 | Some(cap_ref) => cap_ref, |
| 153 | None => { |
| 154 | dst.push(b'$'); |
| 155 | replacement = &replacement[1..]; |
| 156 | continue; |
| 157 | } |
| 158 | }; |
| 159 | replacement = &replacement[cap_ref.end..]; |
| 160 | match cap_ref.cap { |
| 161 | Ref::Number(i) => append(i, dst), |
| 162 | Ref::Named(name) => { |
| 163 | if let Some(i) = name_to_index(name) { |
| 164 | append(i, dst); |
| 165 | } |
| 166 | } |
| 167 | } |
| 168 | } |
| 169 | dst.extend_from_slice(replacement); |
| 170 | } |
| 171 | */ |
| 172 | |
| 173 | /// `CaptureRef` represents a reference to a capture group inside some text. |
| 174 | /// The reference is either a capture group name or a number. |
| 175 | /// |
| 176 | /// It is also tagged with the position in the text following the |
| 177 | /// capture reference. |
| 178 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
| 179 | struct CaptureRef<'a> { |
| 180 | cap: Ref<'a>, |
| 181 | end: usize, |
| 182 | } |
| 183 | |
/// Identifies the capture group that a reference such as `$2`, `$foo` or
/// `${foo}` points at.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// The group is identified by name; the name borrows from the text that
    /// was parsed.
    Named(&'a str),
    /// The group is identified by its index.
    Number(usize),
}
| 192 | |
| 193 | impl<'a> From<&'a str> for Ref<'a> { |
| 194 | fn from(x: &'a str) -> Ref<'a> { |
| 195 | Ref::Named(x) |
| 196 | } |
| 197 | } |
| 198 | |
| 199 | impl From<usize> for Ref<'static> { |
| 200 | fn from(x: usize) -> Ref<'static> { |
| 201 | Ref::Number(x) |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | /// Parses a possible reference to a capture group name in the given text, |
| 206 | /// starting at the beginning of `replacement`. |
| 207 | /// |
| 208 | /// If no such valid reference could be found, None is returned. |
| 209 | /// |
| 210 | /// Note that this returns a "possible" reference because this routine doesn't |
| 211 | /// know whether the reference is to a valid group or not. If it winds up not |
| 212 | /// being a valid reference, then it should be replaced with the empty string. |
| 213 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { |
| 214 | let mut i = 0; |
| 215 | let rep: &[u8] = replacement; |
| 216 | if rep.len() <= 1 || rep[0] != b'$' { |
| 217 | return None; |
| 218 | } |
| 219 | i += 1; |
| 220 | if rep[i] == b'{' { |
| 221 | return find_cap_ref_braced(rep, i + 1); |
| 222 | } |
| 223 | let mut cap_end = i; |
| 224 | while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { |
| 225 | cap_end += 1; |
| 226 | } |
| 227 | if cap_end == i { |
| 228 | return None; |
| 229 | } |
| 230 | // We just verified that the range 0..cap_end is valid ASCII, so it must |
| 231 | // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
| 232 | // check via an unchecked conversion or by parsing the number straight from |
| 233 | // &[u8]. |
| 234 | let cap = core::str::from_utf8(&rep[i..cap_end]) |
| 235 | .expect("valid UTF-8 capture name" ); |
| 236 | Some(CaptureRef { |
| 237 | cap: match cap.parse::<usize>() { |
| 238 | Ok(i) => Ref::Number(i), |
| 239 | Err(_) => Ref::Named(cap), |
| 240 | }, |
| 241 | end: cap_end, |
| 242 | }) |
| 243 | } |
| 244 | |
| 245 | /// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening |
| 246 | /// brace has been found at `i-1` in `rep`. This then looks for a closing |
| 247 | /// brace and returns the capture reference within the brace. |
| 248 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { |
| 249 | assert_eq!(b'{' , rep[i.checked_sub(1).unwrap()]); |
| 250 | let start = i; |
| 251 | while rep.get(i).map_or(false, |&b| b != b'}' ) { |
| 252 | i += 1; |
| 253 | } |
| 254 | if !rep.get(i).map_or(false, |&b| b == b'}' ) { |
| 255 | return None; |
| 256 | } |
| 257 | // When looking at braced names, we don't put any restrictions on the name, |
| 258 | // so it's possible it could be invalid UTF-8. But a capture group name |
| 259 | // can never be invalid UTF-8, so if we have invalid UTF-8, then we can |
| 260 | // safely return None. |
| 261 | let cap = match core::str::from_utf8(&rep[start..i]) { |
| 262 | Err(_) => return None, |
| 263 | Ok(cap) => cap, |
| 264 | }; |
| 265 | Some(CaptureRef { |
| 266 | cap: match cap.parse::<usize>() { |
| 267 | Ok(i) => Ref::Number(i), |
| 268 | Err(_) => Ref::Named(cap), |
| 269 | }, |
| 270 | end: i + 1, |
| 271 | }) |
| 272 | } |
| 273 | |
/// Reports whether `b` may appear in a capture name written without braces,
/// i.e., whether it is in the class `[0-9A-Za-z_]`.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
| 282 | |
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    // Defines a `#[test]` named `$name` that runs `find_cap_ref` on `$text`.
    //
    // With two arguments, the test asserts that no reference is found. With a
    // third argument, it asserts that exactly `$capref` is found.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Builds the expected `CaptureRef` from a group name (`&str`) or a group
    // index (`usize`) and the offset just past the reference.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    // Runs `super::string` on `replacement`, resolving group names through
    // `name_to_index` (a list of `(name, index)` pairs) and group indices
    // through `caps` (the text of each group, by position).
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so that names can be looked up with a binary search below.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    /*
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }
    */

    // Defines a `#[test]` named `$name` asserting that interpolating `$hay`
    // with the name map `$map` and the group values `$caps` yields
    // `$expected`.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                /*
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
                */
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    // In the next four cases the braced reference names a group that does
    // not exist, so it is replaced with the empty string. The spaces on
    // either side of it are literal text and both survive, hence the two
    // consecutive spaces in the expected output.
    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}
| 537 | |