1 | /*! |
2 | Provides routines for interpolating capture group references. |
3 | |
4 | That is, if a replacement string contains references like `$foo` or `${foo1}`, |
5 | then they are replaced with the corresponding capture values for the groups |
6 | named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` |
7 | is supported as well, with `1` corresponding to a capture group index and not |
8 | a name. |
9 | |
10 | This module provides the free functions [`string`] and [`bytes`], which |
11 | interpolate Rust Unicode strings and byte strings, respectively. |
12 | |
13 | # Format |
14 | |
15 | These routines support two different kinds of capture references: unbraced and |
16 | braced. |
17 | |
For the unbraced format, the format supported is `$ref` where `ref` is a
non-empty sequence of characters in the class `[0-9A-Za-z_]`. `ref` is always
the longest possible parse. So for example, `$1a` corresponds to the capture
group named `1a` and not the capture group at index `1`. If `ref` matches
`^[0-9]+$`, then it is treated as a capture group index itself and not a name.
23 | |
24 | For the braced format, the format supported is `${ref}` where `ref` can be any |
25 | sequence of bytes except for `}`. If no closing brace occurs, then it is not |
26 | considered a capture reference. As with the unbraced format, if `ref` matches |
27 | `^[0-9]+$`, then it is treated as a capture group index and not a name. |
28 | |
The braced format is useful for exerting precise control over the name of the
capture reference. For example, `${1}a` corresponds to the capture group
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
corresponds to the capture group reference `1a`. The braced format is also
useful for expressing capture group names that use characters not supported by
the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
named `foo[bar].baz`.
36 | |
37 | If a capture group reference is found and it does not refer to a valid capture |
38 | group, then it will be replaced with the empty string. |
39 | |
40 | To write a literal `$`, use `$$`. |
41 | |
42 | To be clear, and as exhibited via the type signatures in the routines in this |
43 | module, it is impossible for a replacement string to be invalid. A replacement |
44 | string may not have the intended semantics, but the interpolation procedure |
45 | itself can never fail. |
46 | */ |
47 | |
48 | use alloc::string::String; |
49 | |
50 | /// Accepts a replacement string and interpolates capture references with their |
51 | /// corresponding values. |
52 | /// |
53 | /// `append` should be a function that appends the string value of a capture |
54 | /// group at a particular index to the string given. If the capture group |
55 | /// index is invalid, then nothing should be appended. |
56 | /// |
57 | /// `name_to_index` should be a function that maps a capture group name to a |
58 | /// capture group index. If the given name doesn't exist, then `None` should |
59 | /// be returned. |
60 | /// |
61 | /// Finally, `dst` is where the final interpolated contents should be written. |
62 | /// If `replacement` contains no capture group references, then `dst` will be |
63 | /// equivalent to `replacement`. |
64 | /// |
65 | /// See the [module documentation](self) for details about the format |
66 | /// supported. |
67 | pub fn string( |
68 | mut replacement: &str, |
69 | mut append: impl FnMut(usize, &mut String), |
70 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
71 | dst: &mut String, |
72 | ) { |
73 | while !replacement.is_empty() { |
74 | match replacement.find('$' ) { |
75 | None => break, |
76 | Some(i) => { |
77 | dst.push_str(&replacement[..i]); |
78 | replacement = &replacement[i..]; |
79 | } |
80 | } |
81 | // Handle escaping of '$'. |
82 | if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$' ) { |
83 | dst.push_str("$" ); |
84 | replacement = &replacement[2..]; |
85 | continue; |
86 | } |
87 | debug_assert!(!replacement.is_empty()); |
88 | let cap_ref = match find_cap_ref(replacement.as_bytes()) { |
89 | Some(cap_ref) => cap_ref, |
90 | None => { |
91 | dst.push_str("$" ); |
92 | replacement = &replacement[1..]; |
93 | continue; |
94 | } |
95 | }; |
96 | replacement = &replacement[cap_ref.end..]; |
97 | match cap_ref.cap { |
98 | Ref::Number(i) => append(i, dst), |
99 | Ref::Named(name) => { |
100 | if let Some(i) = name_to_index(name) { |
101 | append(i, dst); |
102 | } |
103 | } |
104 | } |
105 | } |
106 | dst.push_str(replacement); |
107 | } |
108 | |
109 | /* |
110 | This should be uncommented and used if we ever provide public APIs for |
111 | searching `&[u8]`. |
112 | |
113 | /// Accepts a replacement byte string and interpolates capture references with |
114 | /// their corresponding values. |
115 | /// |
116 | /// `append` should be a function that appends the byte string value of a |
117 | /// capture group at a particular index to the byte string given. If the |
118 | /// capture group index is invalid, then nothing should be appended. |
119 | /// |
120 | /// `name_to_index` should be a function that maps a capture group name to a |
121 | /// capture group index. If the given name doesn't exist, then `None` should |
122 | /// be returned. |
123 | /// |
124 | /// Finally, `dst` is where the final interpolated contents should be written. |
125 | /// If `replacement` contains no capture group references, then `dst` will be |
126 | /// equivalent to `replacement`. |
127 | /// |
128 | /// See the [module documentation](self) for details about the format |
129 | /// supported. |
130 | pub fn bytes( |
131 | mut replacement: &[u8], |
132 | mut append: impl FnMut(usize, &mut Vec<u8>), |
133 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
134 | dst: &mut Vec<u8>, |
135 | ) { |
136 | while !replacement.is_empty() { |
137 | match replacement.iter().position(|&b| b == b'$') { |
138 | None => break, |
139 | Some(i) => { |
140 | dst.extend_from_slice(&replacement[..i]); |
141 | replacement = &replacement[i..]; |
142 | } |
143 | } |
144 | // Handle escaping of '$'. |
145 | if replacement.get(1).map_or(false, |&b| b == b'$') { |
146 | dst.push(b'$'); |
147 | replacement = &replacement[2..]; |
148 | continue; |
149 | } |
150 | debug_assert!(!replacement.is_empty()); |
151 | let cap_ref = match find_cap_ref(replacement) { |
152 | Some(cap_ref) => cap_ref, |
153 | None => { |
154 | dst.push(b'$'); |
155 | replacement = &replacement[1..]; |
156 | continue; |
157 | } |
158 | }; |
159 | replacement = &replacement[cap_ref.end..]; |
160 | match cap_ref.cap { |
161 | Ref::Number(i) => append(i, dst), |
162 | Ref::Named(name) => { |
163 | if let Some(i) = name_to_index(name) { |
164 | append(i, dst); |
165 | } |
166 | } |
167 | } |
168 | } |
169 | dst.extend_from_slice(replacement); |
170 | } |
171 | */ |
172 | |
173 | /// `CaptureRef` represents a reference to a capture group inside some text. |
174 | /// The reference is either a capture group name or a number. |
175 | /// |
176 | /// It is also tagged with the position in the text following the |
177 | /// capture reference. |
178 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
179 | struct CaptureRef<'a> { |
180 | cap: Ref<'a>, |
181 | end: usize, |
182 | } |
183 | |
/// Identifies a capture group, either by its name or by its index.
///
/// For example, `$2` is a reference by index, while `$foo` and `${foo}` are
/// references by name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Ref<'a> {
    /// A reference by capture group name, borrowed from the replacement text.
    Named(&'a str),
    /// A reference by capture group index.
    Number(usize),
}
192 | |
193 | impl<'a> From<&'a str> for Ref<'a> { |
194 | fn from(x: &'a str) -> Ref<'a> { |
195 | Ref::Named(x) |
196 | } |
197 | } |
198 | |
199 | impl From<usize> for Ref<'static> { |
200 | fn from(x: usize) -> Ref<'static> { |
201 | Ref::Number(x) |
202 | } |
203 | } |
204 | |
205 | /// Parses a possible reference to a capture group name in the given text, |
206 | /// starting at the beginning of `replacement`. |
207 | /// |
208 | /// If no such valid reference could be found, None is returned. |
209 | /// |
210 | /// Note that this returns a "possible" reference because this routine doesn't |
211 | /// know whether the reference is to a valid group or not. If it winds up not |
212 | /// being a valid reference, then it should be replaced with the empty string. |
213 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { |
214 | let mut i = 0; |
215 | let rep: &[u8] = replacement; |
216 | if rep.len() <= 1 || rep[0] != b'$' { |
217 | return None; |
218 | } |
219 | i += 1; |
220 | if rep[i] == b'{' { |
221 | return find_cap_ref_braced(rep, i + 1); |
222 | } |
223 | let mut cap_end = i; |
224 | while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { |
225 | cap_end += 1; |
226 | } |
227 | if cap_end == i { |
228 | return None; |
229 | } |
230 | // We just verified that the range 0..cap_end is valid ASCII, so it must |
231 | // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
232 | // check via an unchecked conversion or by parsing the number straight from |
233 | // &[u8]. |
234 | let cap = core::str::from_utf8(&rep[i..cap_end]) |
235 | .expect("valid UTF-8 capture name" ); |
236 | Some(CaptureRef { |
237 | cap: match cap.parse::<usize>() { |
238 | Ok(i) => Ref::Number(i), |
239 | Err(_) => Ref::Named(cap), |
240 | }, |
241 | end: cap_end, |
242 | }) |
243 | } |
244 | |
245 | /// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening |
246 | /// brace has been found at `i-1` in `rep`. This then looks for a closing |
247 | /// brace and returns the capture reference within the brace. |
248 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { |
249 | assert_eq!(b'{' , rep[i.checked_sub(1).unwrap()]); |
250 | let start = i; |
251 | while rep.get(i).map_or(false, |&b| b != b'}' ) { |
252 | i += 1; |
253 | } |
254 | if !rep.get(i).map_or(false, |&b| b == b'}' ) { |
255 | return None; |
256 | } |
257 | // When looking at braced names, we don't put any restrictions on the name, |
258 | // so it's possible it could be invalid UTF-8. But a capture group name |
259 | // can never be invalid UTF-8, so if we have invalid UTF-8, then we can |
260 | // safely return None. |
261 | let cap = match core::str::from_utf8(&rep[start..i]) { |
262 | Err(_) => return None, |
263 | Ok(cap) => cap, |
264 | }; |
265 | Some(CaptureRef { |
266 | cap: match cap.parse::<usize>() { |
267 | Ok(i) => Ref::Number(i), |
268 | Err(_) => Ref::Named(cap), |
269 | }, |
270 | end: i + 1, |
271 | }) |
272 | } |
273 | |
/// Reports whether `b` may appear in a capture name written without braces.
///
/// The permitted bytes are exactly the ASCII letters, the ASCII digits and
/// the underscore, i.e., the class `[0-9A-Za-z_]`.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
282 | |
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    /// Defines a test named `$name` that runs `find_cap_ref` on `$text`.
    ///
    /// With two arguments the test asserts that no reference is found. With
    /// three arguments it asserts that `$capref` is found.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    /// Builds a `CaptureRef` from a group name or index and an end offset.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    /// Runs `super::string` on `replacement` and returns what it produced.
    ///
    /// `name_to_index` maps group names to group indices and `caps` holds the
    /// text of each group, indexed by group index. An index with no entry in
    /// `caps` contributes nothing.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted by name so that the lookup below can binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    /*
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }
    */

    /// Defines a test named `$name` asserting that interpolating `$hay` with
    /// the name map `$map` and the group values `$caps` yields `$expected`.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                /*
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
                */
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    // In the next four cases the braced reference names a group that does
    // not exist, so it is replaced with the empty string. Only the reference
    // itself is removed: the space on either side of it is kept, which is
    // why the expected output contains two consecutive spaces.
    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}
537 | |