1 | use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::{hex_digit_value, check_suffix}}; |
2 | |
3 | |
4 | /// Must start with `\` |
5 | pub(crate) fn unescape<E: Escapee>(input: &str, offset: usize) -> Result<(E, usize), ParseError> { |
6 | let first = input.as_bytes().get(1) |
7 | .ok_or(perr(offset, UnterminatedEscape))?; |
8 | let out = match first { |
9 | // Quote escapes |
10 | b' \'' => (E::from_byte(b' \'' ), 2), |
11 | b'"' => (E::from_byte(b'"' ), 2), |
12 | |
13 | // Ascii escapes |
14 | b'n' => (E::from_byte(b' \n' ), 2), |
15 | b'r' => (E::from_byte(b' \r' ), 2), |
16 | b't' => (E::from_byte(b' \t' ), 2), |
17 | b' \\' => (E::from_byte(b' \\' ), 2), |
18 | b'0' => (E::from_byte(b' \0' ), 2), |
19 | b'x' => { |
20 | let hex_string = input.get(2..4) |
21 | .ok_or(perr(offset..offset + input.len(), UnterminatedEscape))? |
22 | .as_bytes(); |
23 | let first = hex_digit_value(hex_string[0]) |
24 | .ok_or(perr(offset..offset + 4, InvalidXEscape))?; |
25 | let second = hex_digit_value(hex_string[1]) |
26 | .ok_or(perr(offset..offset + 4, InvalidXEscape))?; |
27 | let value = second + 16 * first; |
28 | |
29 | if E::SUPPORTS_UNICODE && value > 0x7F { |
30 | return Err(perr(offset..offset + 4, NonAsciiXEscape)); |
31 | } |
32 | |
33 | (E::from_byte(value), 4) |
34 | }, |
35 | |
36 | // Unicode escape |
37 | b'u' => { |
38 | if !E::SUPPORTS_UNICODE { |
39 | return Err(perr(offset..offset + 2, UnicodeEscapeInByteLiteral)); |
40 | } |
41 | |
42 | if input.as_bytes().get(2) != Some(&b'{' ) { |
43 | return Err(perr(offset..offset + 2, UnicodeEscapeWithoutBrace)); |
44 | } |
45 | |
46 | let closing_pos = input.bytes().position(|b| b == b'}' ) |
47 | .ok_or(perr(offset..offset + input.len(), UnterminatedUnicodeEscape))?; |
48 | |
49 | let inner = &input[3..closing_pos]; |
50 | if inner.as_bytes().first() == Some(&b'_' ) { |
51 | return Err(perr(4, InvalidStartOfUnicodeEscape)); |
52 | } |
53 | |
54 | let mut v: u32 = 0; |
55 | let mut digit_count = 0; |
56 | for (i, b) in inner.bytes().enumerate() { |
57 | if b == b'_' { |
58 | continue; |
59 | } |
60 | |
61 | let digit = hex_digit_value(b) |
62 | .ok_or(perr(offset + 3 + i, NonHexDigitInUnicodeEscape))?; |
63 | |
64 | if digit_count == 6 { |
65 | return Err(perr(offset + 3 + i, TooManyDigitInUnicodeEscape)); |
66 | } |
67 | digit_count += 1; |
68 | v = 16 * v + digit as u32; |
69 | } |
70 | |
71 | let c = std::char::from_u32(v) |
72 | .ok_or(perr(offset..closing_pos + 1, InvalidUnicodeEscapeChar))?; |
73 | |
74 | (E::from_char(c), closing_pos + 1) |
75 | } |
76 | |
77 | _ => return Err(perr(offset..offset + 2, UnknownEscape)), |
78 | }; |
79 | |
80 | Ok(out) |
81 | } |
82 | |
/// A value that a single escape sequence can be unescaped into.
///
/// Every implementor must be convertible into a `char`, which lets callers
/// append unescaped values to a `String`.
pub(crate) trait Escapee
where
    Self: Into<char>,
{
    /// Whether `\u{…}` escapes are accepted for this type. When `true`,
    /// `\x` escapes are in turn limited to the ASCII range.
    const SUPPORTS_UNICODE: bool;

    /// Builds the value from a single byte.
    fn from_byte(b: u8) -> Self;

    /// Builds the value from a Unicode scalar value.
    fn from_char(c: char) -> Self;
}
88 | |
89 | impl Escapee for u8 { |
90 | const SUPPORTS_UNICODE: bool = false; |
91 | fn from_byte(b: u8) -> Self { |
92 | b |
93 | } |
94 | fn from_char(_: char) -> Self { |
95 | panic!("bug: `<u8 as Escapee>::from_char` was called" ); |
96 | } |
97 | } |
98 | |
99 | impl Escapee for char { |
100 | const SUPPORTS_UNICODE: bool = true; |
101 | fn from_byte(b: u8) -> Self { |
102 | b.into() |
103 | } |
104 | fn from_char(c: char) -> Self { |
105 | c |
106 | } |
107 | } |
108 | |
/// Returns `true` for the bytes that are skipped after the start of a
/// "string continue" (an unescaped backslash followed by `\n`): space, tab,
/// line feed and carriage return.
fn is_string_continue_skipable_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}
114 | |
115 | /// Unescapes a whole string or byte string. |
116 | #[inline (never)] |
117 | pub(crate) fn unescape_string<E: Escapee>( |
118 | input: &str, |
119 | offset: usize, |
120 | ) -> Result<(Option<String>, usize), ParseError> { |
121 | let mut closing_quote_pos = None; |
122 | let mut i = offset; |
123 | let mut end_last_escape = offset; |
124 | let mut value = String::new(); |
125 | while i < input.len() { |
126 | match input.as_bytes()[i] { |
127 | // Handle "string continue". |
128 | b' \\' if input.as_bytes().get(i + 1) == Some(&b' \n' ) => { |
129 | value.push_str(&input[end_last_escape..i]); |
130 | |
131 | // Find the first non-whitespace character. |
132 | let end_escape = input[i + 2..].bytes() |
133 | .position(|b| !is_string_continue_skipable_whitespace(b)) |
134 | .ok_or(perr(None, UnterminatedString))?; |
135 | |
136 | i += 2 + end_escape; |
137 | end_last_escape = i; |
138 | } |
139 | b' \\' => { |
140 | let (c, len) = unescape::<E>(&input[i..input.len() - 1], i)?; |
141 | value.push_str(&input[end_last_escape..i]); |
142 | value.push(c.into()); |
143 | i += len; |
144 | end_last_escape = i; |
145 | } |
146 | b' \r' => { |
147 | if input.as_bytes().get(i + 1) == Some(&b' \n' ) { |
148 | value.push_str(&input[end_last_escape..i]); |
149 | value.push(' \n' ); |
150 | i += 2; |
151 | end_last_escape = i; |
152 | } else { |
153 | return Err(perr(i, IsolatedCr)) |
154 | } |
155 | } |
156 | b'"' => { |
157 | closing_quote_pos = Some(i); |
158 | break; |
159 | }, |
160 | b if !E::SUPPORTS_UNICODE && !b.is_ascii() |
161 | => return Err(perr(i, NonAsciiInByteLiteral)), |
162 | _ => i += 1, |
163 | } |
164 | } |
165 | |
166 | let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedString))?; |
167 | |
168 | let start_suffix = closing_quote_pos + 1; |
169 | let suffix = &input[start_suffix..]; |
170 | check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; |
171 | |
172 | // `value` is only empty if there was no escape in the input string |
173 | // (with the special case of the input being empty). This means the |
174 | // string value basically equals the input, so we store `None`. |
175 | let value = if value.is_empty() { |
176 | None |
177 | } else { |
178 | // There was an escape in the string, so we need to push the |
179 | // remaining unescaped part of the string still. |
180 | value.push_str(&input[end_last_escape..closing_quote_pos]); |
181 | Some(value) |
182 | }; |
183 | |
184 | Ok((value, start_suffix)) |
185 | } |
186 | |
187 | /// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to |
188 | /// just `\n` sequences. Returns an optional new string (if the input contained |
189 | /// any `\r\n`) and the number of hashes used by the literal. |
190 | #[inline (never)] |
191 | pub(crate) fn scan_raw_string<E: Escapee>( |
192 | input: &str, |
193 | offset: usize, |
194 | ) -> Result<(Option<String>, u32, usize), ParseError> { |
195 | // Raw string literal |
196 | let num_hashes = input[offset..].bytes().position(|b| b != b'#' ) |
197 | .ok_or(perr(None, InvalidLiteral))?; |
198 | |
199 | if input.as_bytes().get(offset + num_hashes) != Some(&b'"' ) { |
200 | return Err(perr(None, InvalidLiteral)); |
201 | } |
202 | let start_inner = offset + num_hashes + 1; |
203 | let hashes = &input[offset..num_hashes + offset]; |
204 | |
205 | let mut closing_quote_pos = None; |
206 | let mut i = start_inner; |
207 | let mut end_last_escape = start_inner; |
208 | let mut value = String::new(); |
209 | while i < input.len() { |
210 | let b = input.as_bytes()[i]; |
211 | if b == b'"' && input[i + 1..].starts_with(hashes) { |
212 | closing_quote_pos = Some(i); |
213 | break; |
214 | } |
215 | |
216 | if b == b' \r' { |
217 | // Convert `\r\n` into `\n`. This is currently not well documented |
218 | // in the Rust reference, but is done even for raw strings. That's |
219 | // because rustc simply converts all line endings when reading |
220 | // source files. |
221 | if input.as_bytes().get(i + 1) == Some(&b' \n' ) { |
222 | value.push_str(&input[end_last_escape..i]); |
223 | value.push(' \n' ); |
224 | i += 2; |
225 | end_last_escape = i; |
226 | continue; |
227 | } else if E::SUPPORTS_UNICODE { |
228 | // If no \n follows the \r and we are scanning a raw string |
229 | // (not raw byte string), we error. |
230 | return Err(perr(i, IsolatedCr)) |
231 | } |
232 | } |
233 | |
234 | if !E::SUPPORTS_UNICODE { |
235 | if !b.is_ascii() { |
236 | return Err(perr(i, NonAsciiInByteLiteral)); |
237 | } |
238 | } |
239 | |
240 | i += 1; |
241 | } |
242 | |
243 | let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedRawString))?; |
244 | |
245 | let start_suffix = closing_quote_pos + num_hashes + 1; |
246 | let suffix = &input[start_suffix..]; |
247 | check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; |
248 | |
249 | // `value` is only empty if there was no \r\n in the input string (with the |
250 | // special case of the input being empty). This means the string value |
251 | // equals the input, so we store `None`. |
252 | let value = if value.is_empty() { |
253 | None |
254 | } else { |
255 | // There was an \r\n in the string, so we need to push the remaining |
256 | // unescaped part of the string still. |
257 | value.push_str(&input[end_last_escape..closing_quote_pos]); |
258 | Some(value) |
259 | }; |
260 | |
261 | Ok((value, num_hashes as u32, start_suffix)) |
262 | } |
263 | |