1 | /*! |
2 | Provides routines for interpolating capture group references. |
3 | |
4 | That is, if a replacement string contains references like `$foo` or `${foo1}`, |
5 | then they are replaced with the corresponding capture values for the groups |
6 | named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` |
7 | is supported as well, with `1` corresponding to a capture group index and not |
8 | a name. |
9 | |
10 | This module provides the free functions [`string`] and [`bytes`], which |
11 | interpolate Rust Unicode strings and byte strings, respectively. |
12 | |
13 | # Format |
14 | |
15 | These routines support two different kinds of capture references: unbraced and |
16 | braced. |
17 | |
18 | For the unbraced format, the format supported is `$ref` where `ref` is one or |
19 | more characters in the class `[0-9A-Za-z_]`. `ref` is always the longest |
20 | possible parse. So for example, `$1a` corresponds to the capture group named |
21 | `1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then |
22 | it is treated as a capture group index itself and not a name. |
23 | |
24 | For the braced format, the format supported is `${ref}` where `ref` can be any |
25 | sequence of bytes except for `}`. If no closing brace occurs, then it is not |
26 | considered a capture reference. As with the unbraced format, if `ref` matches |
27 | `^[0-9]+$`, then it is treated as a capture group index and not a name. |
28 | |
29 | The braced format is useful for exerting precise control over the name of the |
30 | capture reference. For example, `${1}a` corresponds to the capture group |
31 | reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above) |
32 | corresponds to the capture group reference `1a`. The braced format is also |
33 | useful for expressing capture group names that use characters not supported by |
34 | the unbraced format. For example, `${foo[bar].baz}` refers to the capture group |
35 | named `foo[bar].baz`. |
36 | |
37 | If a capture group reference is found and it does not refer to a valid capture |
38 | group, then it will be replaced with the empty string. |
39 | |
40 | To write a literal `$`, use `$$`. |
41 | |
42 | To be clear, and as exhibited via the type signatures in the routines in this |
43 | module, it is impossible for a replacement string to be invalid. A replacement |
44 | string may not have the intended semantics, but the interpolation procedure |
45 | itself can never fail. |
46 | */ |
47 | |
48 | use alloc::{string::String, vec::Vec}; |
49 | |
50 | use crate::util::memchr::memchr; |
51 | |
52 | /// Accepts a replacement string and interpolates capture references with their |
53 | /// corresponding values. |
54 | /// |
55 | /// `append` should be a function that appends the string value of a capture |
56 | /// group at a particular index to the string given. If the capture group |
57 | /// index is invalid, then nothing should be appended. |
58 | /// |
59 | /// `name_to_index` should be a function that maps a capture group name to a |
60 | /// capture group index. If the given name doesn't exist, then `None` should |
61 | /// be returned. |
62 | /// |
63 | /// Finally, `dst` is where the final interpolated contents should be written. |
64 | /// If `replacement` contains no capture group references, then `dst` will be |
65 | /// equivalent to `replacement`. |
66 | /// |
67 | /// See the [module documentation](self) for details about the format |
68 | /// supported. |
69 | /// |
70 | /// # Example |
71 | /// |
72 | /// ``` |
73 | /// use regex_automata::util::interpolate; |
74 | /// |
75 | /// let mut dst = String::new(); |
76 | /// interpolate::string( |
77 | /// "foo $bar baz" , |
78 | /// |index, dst| { |
79 | /// if index == 0 { |
80 | /// dst.push_str("BAR" ); |
81 | /// } |
82 | /// }, |
83 | /// |name| { |
84 | /// if name == "bar" { |
85 | /// Some(0) |
86 | /// } else { |
87 | /// None |
88 | /// } |
89 | /// }, |
90 | /// &mut dst, |
91 | /// ); |
92 | /// assert_eq!("foo BAR baz" , dst); |
93 | /// ``` |
94 | pub fn string( |
95 | mut replacement: &str, |
96 | mut append: impl FnMut(usize, &mut String), |
97 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
98 | dst: &mut String, |
99 | ) { |
100 | while !replacement.is_empty() { |
101 | match memchr(b'$' , replacement.as_bytes()) { |
102 | None => break, |
103 | Some(i) => { |
104 | dst.push_str(&replacement[..i]); |
105 | replacement = &replacement[i..]; |
106 | } |
107 | } |
108 | // Handle escaping of '$'. |
109 | if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$' ) { |
110 | dst.push_str("$" ); |
111 | replacement = &replacement[2..]; |
112 | continue; |
113 | } |
114 | debug_assert!(!replacement.is_empty()); |
115 | let cap_ref = match find_cap_ref(replacement.as_bytes()) { |
116 | Some(cap_ref) => cap_ref, |
117 | None => { |
118 | dst.push_str("$" ); |
119 | replacement = &replacement[1..]; |
120 | continue; |
121 | } |
122 | }; |
123 | replacement = &replacement[cap_ref.end..]; |
124 | match cap_ref.cap { |
125 | Ref::Number(i) => append(i, dst), |
126 | Ref::Named(name) => { |
127 | if let Some(i) = name_to_index(name) { |
128 | append(i, dst); |
129 | } |
130 | } |
131 | } |
132 | } |
133 | dst.push_str(replacement); |
134 | } |
135 | |
136 | /// Accepts a replacement byte string and interpolates capture references with |
137 | /// their corresponding values. |
138 | /// |
139 | /// `append` should be a function that appends the byte string value of a |
140 | /// capture group at a particular index to the byte string given. If the |
141 | /// capture group index is invalid, then nothing should be appended. |
142 | /// |
143 | /// `name_to_index` should be a function that maps a capture group name to a |
144 | /// capture group index. If the given name doesn't exist, then `None` should |
145 | /// be returned. |
146 | /// |
147 | /// Finally, `dst` is where the final interpolated contents should be written. |
148 | /// If `replacement` contains no capture group references, then `dst` will be |
149 | /// equivalent to `replacement`. |
150 | /// |
151 | /// See the [module documentation](self) for details about the format |
152 | /// supported. |
153 | /// |
154 | /// # Example |
155 | /// |
156 | /// ``` |
157 | /// use regex_automata::util::interpolate; |
158 | /// |
159 | /// let mut dst = vec![]; |
160 | /// interpolate::bytes( |
161 | /// b"foo $bar baz" , |
162 | /// |index, dst| { |
163 | /// if index == 0 { |
164 | /// dst.extend_from_slice(b"BAR" ); |
165 | /// } |
166 | /// }, |
167 | /// |name| { |
168 | /// if name == "bar" { |
169 | /// Some(0) |
170 | /// } else { |
171 | /// None |
172 | /// } |
173 | /// }, |
174 | /// &mut dst, |
175 | /// ); |
176 | /// assert_eq!(&b"foo BAR baz" [..], dst); |
177 | /// ``` |
178 | pub fn bytes( |
179 | mut replacement: &[u8], |
180 | mut append: impl FnMut(usize, &mut Vec<u8>), |
181 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
182 | dst: &mut Vec<u8>, |
183 | ) { |
184 | while !replacement.is_empty() { |
185 | match memchr(b'$' , replacement) { |
186 | None => break, |
187 | Some(i) => { |
188 | dst.extend_from_slice(&replacement[..i]); |
189 | replacement = &replacement[i..]; |
190 | } |
191 | } |
192 | // Handle escaping of '$'. |
193 | if replacement.get(1).map_or(false, |&b| b == b'$' ) { |
194 | dst.push(b'$' ); |
195 | replacement = &replacement[2..]; |
196 | continue; |
197 | } |
198 | debug_assert!(!replacement.is_empty()); |
199 | let cap_ref = match find_cap_ref(replacement) { |
200 | Some(cap_ref) => cap_ref, |
201 | None => { |
202 | dst.push(b'$' ); |
203 | replacement = &replacement[1..]; |
204 | continue; |
205 | } |
206 | }; |
207 | replacement = &replacement[cap_ref.end..]; |
208 | match cap_ref.cap { |
209 | Ref::Number(i) => append(i, dst), |
210 | Ref::Named(name) => { |
211 | if let Some(i) = name_to_index(name) { |
212 | append(i, dst); |
213 | } |
214 | } |
215 | } |
216 | } |
217 | dst.extend_from_slice(replacement); |
218 | } |
219 | |
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
    /// The group being referred to, either by name or by index.
    cap: Ref<'a>,
    /// The byte offset immediately after the reference (past the closing
    /// `}` for the braced form), measured from the start of the text that
    /// was handed to the parser.
    end: usize,
}
230 | |
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// A reference by capture group name. The name is borrowed from the
    /// text in which the reference was found.
    Named(&'a str),
    /// A reference by capture group index.
    Number(usize),
}
239 | |
240 | impl<'a> From<&'a str> for Ref<'a> { |
241 | fn from(x: &'a str) -> Ref<'a> { |
242 | Ref::Named(x) |
243 | } |
244 | } |
245 | |
246 | impl From<usize> for Ref<'static> { |
247 | fn from(x: usize) -> Ref<'static> { |
248 | Ref::Number(x) |
249 | } |
250 | } |
251 | |
252 | /// Parses a possible reference to a capture group name in the given text, |
253 | /// starting at the beginning of `replacement`. |
254 | /// |
255 | /// If no such valid reference could be found, None is returned. |
256 | /// |
257 | /// Note that this returns a "possible" reference because this routine doesn't |
258 | /// know whether the reference is to a valid group or not. If it winds up not |
259 | /// being a valid reference, then it should be replaced with the empty string. |
260 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { |
261 | let mut i = 0; |
262 | let rep: &[u8] = replacement; |
263 | if rep.len() <= 1 || rep[0] != b'$' { |
264 | return None; |
265 | } |
266 | i += 1; |
267 | if rep[i] == b'{' { |
268 | return find_cap_ref_braced(rep, i + 1); |
269 | } |
270 | let mut cap_end = i; |
271 | while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { |
272 | cap_end += 1; |
273 | } |
274 | if cap_end == i { |
275 | return None; |
276 | } |
277 | // We just verified that the range 0..cap_end is valid ASCII, so it must |
278 | // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
279 | // check via an unchecked conversion or by parsing the number straight from |
280 | // &[u8]. |
281 | let cap = core::str::from_utf8(&rep[i..cap_end]) |
282 | .expect("valid UTF-8 capture name" ); |
283 | Some(CaptureRef { |
284 | cap: match cap.parse::<usize>() { |
285 | Ok(i) => Ref::Number(i), |
286 | Err(_) => Ref::Named(cap), |
287 | }, |
288 | end: cap_end, |
289 | }) |
290 | } |
291 | |
292 | /// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening |
293 | /// brace has been found at `i-1` in `rep`. This then looks for a closing |
294 | /// brace and returns the capture reference within the brace. |
295 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { |
296 | assert_eq!(b'{' , rep[i.checked_sub(1).unwrap()]); |
297 | let start = i; |
298 | while rep.get(i).map_or(false, |&b| b != b'}' ) { |
299 | i += 1; |
300 | } |
301 | if !rep.get(i).map_or(false, |&b| b == b'}' ) { |
302 | return None; |
303 | } |
304 | // When looking at braced names, we don't put any restrictions on the name, |
305 | // so it's possible it could be invalid UTF-8. But a capture group name |
306 | // can never be invalid UTF-8, so if we have invalid UTF-8, then we can |
307 | // safely return None. |
308 | let cap = match core::str::from_utf8(&rep[start..i]) { |
309 | Err(_) => return None, |
310 | Ok(cap) => cap, |
311 | }; |
312 | Some(CaptureRef { |
313 | cap: match cap.parse::<usize>() { |
314 | Ok(i) => Ref::Number(i), |
315 | Err(_) => Ref::Named(cap), |
316 | }, |
317 | end: i + 1, |
318 | }) |
319 | } |
320 | |
/// Reports whether `b` may appear in a capture name written in the unbraced
/// (`$name`) form, i.e. whether it is in the class `[0-9A-Za-z_]`.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
329 | |
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    // Generates a test for `find_cap_ref`. With two arguments, the test
    // asserts that no capture reference is found at the start of the text.
    // With three, it asserts that the given reference is found.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Builds the expected `CaptureRef` from a name (`&str`) or an index
    // (`usize`) and the offset just past the reference.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    // Runs `super::string` on `replacement`. `name_to_index` maps group
    // names to indices and `caps[i]` is the value of the group at index `i`.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so that names can be looked up with a binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    // Same as `interpolate_string`, but goes through `super::bytes` and
    // converts the output back to a `String`.
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so that names can be looked up with a binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }

    // Generates a test asserting that both `string` and `bytes` turn `$hay`
    // into `$expected` for the given name map and capture values.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    // In the next four cases the braced reference names a group that does
    // not exist, so it is replaced with the empty string. The spaces on both
    // sides of the reference are literal text and are both kept, hence the
    // two consecutive spaces in the expected output.
    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}
580 | |