1 | use std::str; |
2 | |
3 | use crate::find_byte::find_byte; |
4 | |
5 | use crate::re_bytes; |
6 | use crate::re_unicode; |
7 | |
8 | pub fn expand_str( |
9 | caps: &re_unicode::Captures<'_>, |
10 | mut replacement: &str, |
11 | dst: &mut String, |
12 | ) { |
13 | while !replacement.is_empty() { |
14 | match find_byte(b'$' , replacement.as_bytes()) { |
15 | None => break, |
16 | Some(i) => { |
17 | dst.push_str(&replacement[..i]); |
18 | replacement = &replacement[i..]; |
19 | } |
20 | } |
21 | if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$' ) { |
22 | dst.push_str("$" ); |
23 | replacement = &replacement[2..]; |
24 | continue; |
25 | } |
26 | debug_assert!(!replacement.is_empty()); |
27 | let cap_ref = match find_cap_ref(replacement.as_bytes()) { |
28 | Some(cap_ref) => cap_ref, |
29 | None => { |
30 | dst.push_str("$" ); |
31 | replacement = &replacement[1..]; |
32 | continue; |
33 | } |
34 | }; |
35 | replacement = &replacement[cap_ref.end..]; |
36 | match cap_ref.cap { |
37 | Ref::Number(i) => { |
38 | dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or("" )); |
39 | } |
40 | Ref::Named(name) => { |
41 | dst.push_str( |
42 | caps.name(name).map(|m| m.as_str()).unwrap_or("" ), |
43 | ); |
44 | } |
45 | } |
46 | } |
47 | dst.push_str(replacement); |
48 | } |
49 | |
50 | pub fn expand_bytes( |
51 | caps: &re_bytes::Captures<'_>, |
52 | mut replacement: &[u8], |
53 | dst: &mut Vec<u8>, |
54 | ) { |
55 | while !replacement.is_empty() { |
56 | match find_byte(b'$' , replacement) { |
57 | None => break, |
58 | Some(i) => { |
59 | dst.extend(&replacement[..i]); |
60 | replacement = &replacement[i..]; |
61 | } |
62 | } |
63 | if replacement.get(1).map_or(false, |&b| b == b'$' ) { |
64 | dst.push(b'$' ); |
65 | replacement = &replacement[2..]; |
66 | continue; |
67 | } |
68 | debug_assert!(!replacement.is_empty()); |
69 | let cap_ref = match find_cap_ref(replacement) { |
70 | Some(cap_ref) => cap_ref, |
71 | None => { |
72 | dst.push(b'$' ); |
73 | replacement = &replacement[1..]; |
74 | continue; |
75 | } |
76 | }; |
77 | replacement = &replacement[cap_ref.end..]; |
78 | match cap_ref.cap { |
79 | Ref::Number(i) => { |
80 | dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"" )); |
81 | } |
82 | Ref::Named(name) => { |
83 | dst.extend( |
84 | caps.name(name).map(|m| m.as_bytes()).unwrap_or(b"" ), |
85 | ); |
86 | } |
87 | } |
88 | } |
89 | dst.extend(replacement); |
90 | } |
91 | |
92 | /// `CaptureRef` represents a reference to a capture group inside some text. |
93 | /// The reference is either a capture group name or a number. |
94 | /// |
95 | /// It is also tagged with the position in the text following the |
96 | /// capture reference. |
97 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
98 | struct CaptureRef<'a> { |
99 | cap: Ref<'a>, |
100 | end: usize, |
101 | } |
102 | |
/// Identifies a capture group, either by name or by number.
///
/// This is what a reference such as `$2`, `$foo` or `${foo}` resolves to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Ref<'a> {
    /// The group is identified by its name.
    Named(&'a str),
    /// The group is identified by its index.
    Number(usize),
}
111 | |
112 | impl<'a> From<&'a str> for Ref<'a> { |
113 | fn from(x: &'a str) -> Ref<'a> { |
114 | Ref::Named(x) |
115 | } |
116 | } |
117 | |
118 | impl From<usize> for Ref<'static> { |
119 | fn from(x: usize) -> Ref<'static> { |
120 | Ref::Number(x) |
121 | } |
122 | } |
123 | |
124 | /// Parses a possible reference to a capture group name in the given text, |
125 | /// starting at the beginning of `replacement`. |
126 | /// |
127 | /// If no such valid reference could be found, None is returned. |
128 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { |
129 | let mut i = 0; |
130 | let rep: &[u8] = replacement; |
131 | if rep.len() <= 1 || rep[0] != b'$' { |
132 | return None; |
133 | } |
134 | i += 1; |
135 | if rep[i] == b'{' { |
136 | return find_cap_ref_braced(rep, i + 1); |
137 | } |
138 | let mut cap_end = i; |
139 | while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { |
140 | cap_end += 1; |
141 | } |
142 | if cap_end == i { |
143 | return None; |
144 | } |
145 | // We just verified that the range 0..cap_end is valid ASCII, so it must |
146 | // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
147 | // check via an unchecked conversion or by parsing the number straight from |
148 | // &[u8]. |
149 | let cap = |
150 | str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name" ); |
151 | Some(CaptureRef { |
152 | cap: match cap.parse::<u32>() { |
153 | Ok(i) => Ref::Number(i as usize), |
154 | Err(_) => Ref::Named(cap), |
155 | }, |
156 | end: cap_end, |
157 | }) |
158 | } |
159 | |
160 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { |
161 | let start: usize = i; |
162 | while rep.get(i).map_or(default:false, |&b: u8| b != b'}' ) { |
163 | i += 1; |
164 | } |
165 | if !rep.get(i).map_or(default:false, |&b: u8| b == b'}' ) { |
166 | return None; |
167 | } |
168 | // When looking at braced names, we don't put any restrictions on the name, |
169 | // so it's possible it could be invalid UTF-8. But a capture group name |
170 | // can never be invalid UTF-8, so if we have invalid UTF-8, then we can |
171 | // safely return None. |
172 | let cap: &str = match str::from_utf8(&rep[start..i]) { |
173 | Err(_) => return None, |
174 | Ok(cap: &str) => cap, |
175 | }; |
176 | Some(CaptureRef { |
177 | cap: match cap.parse::<u32>() { |
178 | Ok(i: u32) => Ref::Number(i as usize), |
179 | Err(_) => Ref::Named(cap), |
180 | }, |
181 | end: i + 1, |
182 | }) |
183 | } |
184 | |
/// Reports whether `b` may appear in a capture name written without braces,
/// i.e. whether it is an ASCII letter, an ASCII digit or an underscore.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
193 | |
#[cfg(test)]
mod tests {
    use super::{find_cap_ref, CaptureRef};

    // Declares a `#[test]` function that runs `find_cap_ref` on the bytes of
    // a string and compares the result with an expectation:
    //
    // * `find!(name, text)` expects that no reference is found;
    // * `find!(name, text, capref)` expects `Some(capref)`.
    macro_rules! find {
        (@case $test_name:ident, $input:expr, $want:expr) => {
            #[test]
            fn $test_name() {
                assert_eq!($want, find_cap_ref($input.as_bytes()));
            }
        };
        ($test_name:ident, $input:expr) => {
            find!(@case $test_name, $input, None);
        };
        ($test_name:ident, $input:expr, $expected:expr) => {
            find!(@case $test_name, $input, Some($expected));
        };
    }

    // Builds the expected `CaptureRef` from a group name or number and the
    // offset at which the reference ends.
    macro_rules! c {
        ($target:expr, $end:expr) => {
            CaptureRef { cap: $target.into(), end: $end }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));
}
248 | |