1use std::str;
2
3use crate::find_byte::find_byte;
4
5use crate::re_bytes;
6use crate::re_unicode;
7
8pub fn expand_str(
9 caps: &re_unicode::Captures<'_>,
10 mut replacement: &str,
11 dst: &mut String,
12) {
13 while !replacement.is_empty() {
14 match find_byte(b'$', replacement.as_bytes()) {
15 None => break,
16 Some(i) => {
17 dst.push_str(&replacement[..i]);
18 replacement = &replacement[i..];
19 }
20 }
21 if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
22 dst.push_str("$");
23 replacement = &replacement[2..];
24 continue;
25 }
26 debug_assert!(!replacement.is_empty());
27 let cap_ref = match find_cap_ref(replacement.as_bytes()) {
28 Some(cap_ref) => cap_ref,
29 None => {
30 dst.push_str("$");
31 replacement = &replacement[1..];
32 continue;
33 }
34 };
35 replacement = &replacement[cap_ref.end..];
36 match cap_ref.cap {
37 Ref::Number(i) => {
38 dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or(""));
39 }
40 Ref::Named(name) => {
41 dst.push_str(
42 caps.name(name).map(|m| m.as_str()).unwrap_or(""),
43 );
44 }
45 }
46 }
47 dst.push_str(replacement);
48}
49
50pub fn expand_bytes(
51 caps: &re_bytes::Captures<'_>,
52 mut replacement: &[u8],
53 dst: &mut Vec<u8>,
54) {
55 while !replacement.is_empty() {
56 match find_byte(b'$', replacement) {
57 None => break,
58 Some(i) => {
59 dst.extend(&replacement[..i]);
60 replacement = &replacement[i..];
61 }
62 }
63 if replacement.get(1).map_or(false, |&b| b == b'$') {
64 dst.push(b'$');
65 replacement = &replacement[2..];
66 continue;
67 }
68 debug_assert!(!replacement.is_empty());
69 let cap_ref = match find_cap_ref(replacement) {
70 Some(cap_ref) => cap_ref,
71 None => {
72 dst.push(b'$');
73 replacement = &replacement[1..];
74 continue;
75 }
76 };
77 replacement = &replacement[cap_ref.end..];
78 match cap_ref.cap {
79 Ref::Number(i) => {
80 dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b""));
81 }
82 Ref::Named(name) => {
83 dst.extend(
84 caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""),
85 );
86 }
87 }
88 }
89 dst.extend(replacement);
90}
91
92/// `CaptureRef` represents a reference to a capture group inside some text.
93/// The reference is either a capture group name or a number.
94///
95/// It is also tagged with the position in the text following the
96/// capture reference.
97#[derive(Clone, Copy, Debug, Eq, PartialEq)]
98struct CaptureRef<'a> {
99 cap: Ref<'a>,
100 end: usize,
101}
102
/// Identifies the capture group that a reference points at.
///
/// For example `$2` or `${2}` (by index) and `$foo` or `${foo}` (by name).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// The group is identified by its name.
    Named(&'a str),
    /// The group is identified by its index.
    Number(usize),
}
111
112impl<'a> From<&'a str> for Ref<'a> {
113 fn from(x: &'a str) -> Ref<'a> {
114 Ref::Named(x)
115 }
116}
117
118impl From<usize> for Ref<'static> {
119 fn from(x: usize) -> Ref<'static> {
120 Ref::Number(x)
121 }
122}
123
124/// Parses a possible reference to a capture group name in the given text,
125/// starting at the beginning of `replacement`.
126///
127/// If no such valid reference could be found, None is returned.
128fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
129 let mut i = 0;
130 let rep: &[u8] = replacement;
131 if rep.len() <= 1 || rep[0] != b'$' {
132 return None;
133 }
134 i += 1;
135 if rep[i] == b'{' {
136 return find_cap_ref_braced(rep, i + 1);
137 }
138 let mut cap_end = i;
139 while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
140 cap_end += 1;
141 }
142 if cap_end == i {
143 return None;
144 }
145 // We just verified that the range 0..cap_end is valid ASCII, so it must
146 // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
147 // check via an unchecked conversion or by parsing the number straight from
148 // &[u8].
149 let cap =
150 str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
151 Some(CaptureRef {
152 cap: match cap.parse::<u32>() {
153 Ok(i) => Ref::Number(i as usize),
154 Err(_) => Ref::Named(cap),
155 },
156 end: cap_end,
157 })
158}
159
160fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
161 let start: usize = i;
162 while rep.get(i).map_or(default:false, |&b: u8| b != b'}') {
163 i += 1;
164 }
165 if !rep.get(i).map_or(default:false, |&b: u8| b == b'}') {
166 return None;
167 }
168 // When looking at braced names, we don't put any restrictions on the name,
169 // so it's possible it could be invalid UTF-8. But a capture group name
170 // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
171 // safely return None.
172 let cap: &str = match str::from_utf8(&rep[start..i]) {
173 Err(_) => return None,
174 Ok(cap: &str) => cap,
175 };
176 Some(CaptureRef {
177 cap: match cap.parse::<u32>() {
178 Ok(i: u32) => Ref::Number(i as usize),
179 Err(_) => Ref::Named(cap),
180 },
181 end: i + 1,
182 })
183}
184
/// Reports whether `b` may appear in a capture name written without braces:
/// an ASCII letter, an ASCII digit, or an underscore.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
193
#[cfg(test)]
mod tests {
    use super::{find_cap_ref, CaptureRef};

    // Generates one `#[test]` per invocation. With two arguments the test
    // asserts that `find_cap_ref` rejects the text; with three it asserts
    // that the given `CaptureRef` is returned.
    macro_rules! find {
        // Internal rule shared by both public forms.
        (@case $test:ident, $input:expr, $want:expr) => {
            #[test]
            fn $test() {
                assert_eq!($want, find_cap_ref($input.as_bytes()));
            }
        };
        ($test:ident, $input:expr) => {
            find!(@case $test, $input, None);
        };
        ($test:ident, $input:expr, $capref:expr) => {
            find!(@case $test, $input, Some($capref));
        };
    }

    // Shorthand for an expected `CaptureRef`: a group name or number plus the
    // offset just past the reference.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));
}
248