1 | use std::borrow::Cow; |
2 | use std::char; |
3 | use std::ops::RangeInclusive; |
4 | |
5 | use winnow::combinator::alt; |
6 | use winnow::combinator::cut_err; |
7 | use winnow::combinator::delimited; |
8 | use winnow::combinator::fail; |
9 | use winnow::combinator::opt; |
10 | use winnow::combinator::peek; |
11 | use winnow::combinator::preceded; |
12 | use winnow::combinator::repeat; |
13 | use winnow::combinator::success; |
14 | use winnow::combinator::terminated; |
15 | use winnow::prelude::*; |
16 | use winnow::token::any; |
17 | use winnow::token::none_of; |
18 | use winnow::token::one_of; |
19 | use winnow::token::tag; |
20 | use winnow::token::take_while; |
21 | |
22 | use crate::parser::errors::CustomError; |
23 | use crate::parser::numbers::HEXDIG; |
24 | use crate::parser::prelude::*; |
25 | use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR}; |
26 | |
27 | // ;; String |
28 | |
29 | // string = ml-basic-string / basic-string / ml-literal-string / literal-string |
30 | pub(crate) fn string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> { |
31 | altimpl Parser, …>(( |
32 | ml_basic_string, |
33 | basic_string, |
34 | ml_literal_string, |
35 | literal_string.map(Cow::Borrowed), |
36 | )) |
37 | .parse_next(input) |
38 | } |
39 | |
40 | // ;; Basic String |
41 | |
42 | // basic-string = quotation-mark *basic-char quotation-mark |
43 | pub(crate) fn basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> { |
44 | let (mut input: Located<&BStr>, _) = one_of(QUOTATION_MARK).parse_next(input)?; |
45 | |
46 | let mut c: Cow<'_, str> = Cow::Borrowed("" ); |
47 | if let Some((i: Located<&BStr>, ci: Cow<'_, str>)) = ok_error(res:basic_chars.parse_next(input))? { |
48 | input = i; |
49 | c = ci; |
50 | } |
51 | while let Some((i: Located<&BStr>, ci: Cow<'_, str>)) = ok_error(res:basic_chars.parse_next(input))? { |
52 | input = i; |
53 | c.to_mut().push_str(&ci); |
54 | } |
55 | |
56 | let (input: Located<&BStr>, _) = cut_errContext, …>, …, …, …, …>(parser:one_of(QUOTATION_MARK)) |
57 | .context(Context::Expression("basic string" )) |
58 | .parse_next(input)?; |
59 | |
60 | Ok((input, c)) |
61 | } |
62 | |
63 | // quotation-mark = %x22 ; " |
/// The double-quote byte (`%x22` in the TOML ABNF) that opens and closes a basic string.
pub(crate) const QUOTATION_MARK: u8 = 0x22;
65 | |
66 | // basic-char = basic-unescaped / escaped |
67 | fn basic_chars(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> { |
68 | altimpl Parser, …>(( |
69 | // Deviate from the official grammar by batching the unescaped chars so we build a string a |
70 | // chunk at a time, rather than a `char` at a time. |
71 | take_whileTryMap, …>, …, …, …, …, …, …>(range:1.., BASIC_UNESCAPED) |
72 | .try_map(std::str::from_utf8) |
73 | .map(Cow::Borrowed), |
74 | escaped.map(|c: char| Cow::Owned(String::from(c))), |
75 | )) |
76 | .parse_next(input) |
77 | } |
78 | |
79 | // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
80 | pub(crate) const BASIC_UNESCAPED: ( |
81 | (u8, u8), |
82 | u8, |
83 | RangeInclusive<u8>, |
84 | RangeInclusive<u8>, |
85 | RangeInclusive<u8>, |
86 | ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
87 | |
88 | // escaped = escape escape-seq-char |
89 | fn escaped(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> { |
90 | preceded(ESCAPE, second:escape_seq_char).parse_next(input) |
91 | } |
92 | |
93 | // escape = %x5C ; \ |
/// The backslash byte (`%x5C` in the TOML ABNF) that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = b'\\';
95 | |
96 | // escape-seq-char = %x22 ; " quotation mark U+0022 |
97 | // escape-seq-char =/ %x5C ; \ reverse solidus U+005C |
98 | // escape-seq-char =/ %x62 ; b backspace U+0008 |
99 | // escape-seq-char =/ %x66 ; f form feed U+000C |
100 | // escape-seq-char =/ %x6E ; n line feed U+000A |
101 | // escape-seq-char =/ %x72 ; r carriage return U+000D |
102 | // escape-seq-char =/ %x74 ; t tab U+0009 |
103 | // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX |
104 | // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX |
105 | fn escape_seq_char(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> { |
106 | dispatch! {any; |
107 | b'b' => success(' \u{8}' ), |
108 | b'f' => success(' \u{c}' ), |
109 | b'n' => success(' \n' ), |
110 | b'r' => success(' \r' ), |
111 | b't' => success(' \t' ), |
112 | b'u' => cut_err(hexescape::<4>).context(Context::Expression("unicode 4-digit hex code" )), |
113 | b'U' => cut_err(hexescape::<8>).context(Context::Expression("unicode 8-digit hex code" )), |
114 | b' \\' => success(' \\' ), |
115 | b'"' => success('"' ), |
116 | _ => { |
117 | cut_err(fail::<_, char, _>) |
118 | .context(Context::Expression("escape sequence" )) |
119 | .context(Context::Expected(ParserValue::CharLiteral('b' ))) |
120 | .context(Context::Expected(ParserValue::CharLiteral('f' ))) |
121 | .context(Context::Expected(ParserValue::CharLiteral('n' ))) |
122 | .context(Context::Expected(ParserValue::CharLiteral('r' ))) |
123 | .context(Context::Expected(ParserValue::CharLiteral('t' ))) |
124 | .context(Context::Expected(ParserValue::CharLiteral('u' ))) |
125 | .context(Context::Expected(ParserValue::CharLiteral('U' ))) |
126 | .context(Context::Expected(ParserValue::CharLiteral(' \\' ))) |
127 | .context(Context::Expected(ParserValue::CharLiteral('"' ))) |
128 | } |
129 | } |
130 | .parse_next(input) |
131 | } |
132 | |
133 | pub(crate) fn hexescape<const N: usize>( |
134 | input: Input<'_>, |
135 | ) -> IResult<Input<'_>, char, ParserError<'_>> { |
136 | take_whileTryMap, …, …, …, …, …>, …, …, …, …, …>, …, …, …, …, …, …>(0..=N, HEXDIG) |
137 | .verify(|b: &[u8]| b.len() == N) |
138 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`is_ascii_digit` filters out on-ASCII" ) }) |
139 | .verify_map(|s: &str| u32::from_str_radix(src:s, radix:16).ok()) |
140 | .try_map(|h: u32| char::from_u32(h).ok_or(err:CustomError::OutOfRange)) |
141 | .parse_next(input) |
142 | } |
143 | |
144 | // ;; Multiline Basic String |
145 | |
146 | // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body |
147 | // ml-basic-string-delim |
148 | fn ml_basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> { |
149 | delimitedContext, …>, …, …, …, …>( |
150 | ML_BASIC_STRING_DELIM, |
151 | second:preceded(opt(newline), cut_err(ml_basic_body)), |
152 | third:cut_err(ML_BASIC_STRING_DELIM), |
153 | ) |
154 | .context(Context::Expression("multiline basic string" )) |
155 | .parse_next(input) |
156 | } |
157 | |
158 | // ml-basic-string-delim = 3quotation-mark |
/// The three-quote delimiter (`"""`) that opens and closes a multiline basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
160 | |
161 | // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ] |
162 | fn ml_basic_body(mut input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> { |
163 | let mut c = Cow::Borrowed("" ); |
164 | if let Some((i, ci)) = ok_error(mlb_content.parse_next(input))? { |
165 | input = i; |
166 | c = ci; |
167 | } |
168 | while let Some((i, ci)) = ok_error(mlb_content.parse_next(input))? { |
169 | input = i; |
170 | c.to_mut().push_str(&ci); |
171 | } |
172 | |
173 | while let Some((i, qi)) = ok_error(mlb_quotes(none_of(b' \"' ).value(())).parse_next(input))? { |
174 | if let Some((i, ci)) = ok_error(mlb_content.parse_next(i))? { |
175 | input = i; |
176 | c.to_mut().push_str(qi); |
177 | c.to_mut().push_str(&ci); |
178 | while let Some((i, ci)) = ok_error(mlb_content.parse_next(input))? { |
179 | input = i; |
180 | c.to_mut().push_str(&ci); |
181 | } |
182 | } else { |
183 | break; |
184 | } |
185 | } |
186 | |
187 | if let Some((i, qi)) = |
188 | ok_error(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(())).parse_next(input))? |
189 | { |
190 | input = i; |
191 | c.to_mut().push_str(qi); |
192 | } |
193 | |
194 | Ok((input, c)) |
195 | } |
196 | |
197 | // mlb-content = mlb-char / newline / mlb-escaped-nl |
198 | // mlb-char = mlb-unescaped / escaped |
199 | fn mlb_content(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> { |
200 | altimpl Parser, …>(( |
201 | // Deviate from the official grammar by batching the unescaped chars so we build a string a |
202 | // chunk at a time, rather than a `char` at a time. |
203 | take_whileTryMap, …>, …, …, …, …, …, …>(range:1.., MLB_UNESCAPED) |
204 | .try_map(std::str::from_utf8) |
205 | .map(Cow::Borrowed), |
206 | // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences |
207 | mlb_escaped_nl.map(|_| Cow::Borrowed("" )), |
208 | escaped.map(|c: char| Cow::Owned(String::from(c))), |
209 | newline.map(|_| Cow::Borrowed(" \n" )), |
210 | )) |
211 | .parse_next(input) |
212 | } |
213 | |
214 | // mlb-quotes = 1*2quotation-mark |
215 | fn mlb_quotes<'i>( |
216 | mut term: impl winnow::Parser<Input<'i>, (), ParserError<'i>>, |
217 | ) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> { |
218 | move |input: Located<&BStr>| { |
219 | let res: Result<(Located<&BStr>, &…), …> = terminatedMap, …>, …, …, …, …, …>(first:b" \"\"" , second:peek(term.by_ref())) |
220 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
221 | .parse_next(input); |
222 | |
223 | match res { |
224 | Err(winnow::error::ErrMode::Backtrack(_)) => terminatedMap, …>, …, …, …, …, …>(first:b" \"" , second:peek(term.by_ref())) |
225 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
226 | .parse_next(input), |
227 | res: Result<(Located<&BStr>, &…), …> => res, |
228 | } |
229 | } |
230 | } |
231 | |
232 | // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
233 | pub(crate) const MLB_UNESCAPED: ( |
234 | (u8, u8), |
235 | u8, |
236 | RangeInclusive<u8>, |
237 | RangeInclusive<u8>, |
238 | RangeInclusive<u8>, |
239 | ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
240 | |
// mlb-escaped-nl = escape ws newline *( wschar / newline )
242 | // When the last non-whitespace character on a line is a \, |
243 | // it will be trimmed along with all whitespace |
244 | // (including newlines) up to the next non-whitespace |
245 | // character or closing delimiter. |
246 | fn mlb_escaped_nl(input: Input<'_>) -> IResult<Input<'_>, (), ParserError<'_>> { |
247 | repeatValue(1.., (ESCAPE, ws, ws_newlines)) |
248 | .map(|()| ()) |
249 | .value(()) |
250 | .parse_next(input) |
251 | } |
252 | |
253 | // ;; Literal String |
254 | |
255 | // literal-string = apostrophe *literal-char apostrophe |
256 | pub(crate) fn literal_string(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> { |
257 | delimitedContext, …, …, …, …, …, …>, …, …, …, …>( |
258 | APOSTROPHE, |
259 | second:cut_err(take_while(0.., LITERAL_CHAR)), |
260 | third:cut_err(APOSTROPHE), |
261 | ) |
262 | .try_map(std::str::from_utf8) |
263 | .context(Context::Expression("literal string" )) |
264 | .parse_next(input) |
265 | } |
266 | |
267 | // apostrophe = %x27 ; ' apostrophe |
/// The apostrophe byte (`%x27` in the TOML ABNF) that opens and closes a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
269 | |
270 | // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii |
271 | pub(crate) const LITERAL_CHAR: ( |
272 | u8, |
273 | RangeInclusive<u8>, |
274 | RangeInclusive<u8>, |
275 | RangeInclusive<u8>, |
276 | ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
277 | |
278 | // ;; Multiline Literal String |
279 | |
280 | // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body |
281 | // ml-literal-string-delim |
282 | fn ml_literal_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> { |
283 | delimitedContext, …>, …, …, …, …>( |
284 | (ML_LITERAL_STRING_DELIM, opt(newline)), |
285 | second:cut_err(ml_literal_body.map(|t| { |
286 | if t.contains(" \r\n" ) { |
287 | Cow::Owned(t.replace(" \r\n" , " \n" )) |
288 | } else { |
289 | Cow::Borrowed(t) |
290 | } |
291 | })), |
292 | third:cut_err(ML_LITERAL_STRING_DELIM), |
293 | ) |
294 | .context(Context::Expression("multiline literal string" )) |
295 | .parse_next(input) |
296 | } |
297 | |
298 | // ml-literal-string-delim = 3apostrophe |
/// The three-apostrophe delimiter (`'''`) that opens and closes a multiline literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
300 | |
301 | // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ] |
302 | fn ml_literal_body(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> { |
303 | ( |
304 | repeat(range:0.., f:mll_content).map(|()| ()), |
305 | repeatimpl Parser, …>( |
306 | range:0.., |
307 | ( |
308 | mll_quotes(term:none_of(APOSTROPHE).value(())), |
309 | repeat(range:1.., f:mll_content).map(|()| ()), |
310 | ), |
311 | ) |
312 | .map(|()| ()), |
313 | opt(mll_quotes(term:tag(ML_LITERAL_STRING_DELIM).value(()))), |
314 | ) |
315 | .recognize() |
316 | .try_map(std::str::from_utf8) |
317 | .parse_next(input) |
318 | } |
319 | |
320 | // mll-content = mll-char / newline |
321 | fn mll_content(input: Input<'_>) -> IResult<Input<'_>, u8, ParserError<'_>> { |
322 | alt((one_of(MLL_CHAR), newline)).parse_next(input) |
323 | } |
324 | |
325 | // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii |
326 | const MLL_CHAR: ( |
327 | u8, |
328 | RangeInclusive<u8>, |
329 | RangeInclusive<u8>, |
330 | RangeInclusive<u8>, |
331 | ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
332 | |
333 | // mll-quotes = 1*2apostrophe |
334 | fn mll_quotes<'i>( |
335 | mut term: impl winnow::Parser<Input<'i>, (), ParserError<'i>>, |
336 | ) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> { |
337 | move |input: Located<&BStr>| { |
338 | let res: Result<(Located<&BStr>, &…), …> = terminatedMap, …>, …, …, …, …, …>(first:b"''" , second:peek(term.by_ref())) |
339 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
340 | .parse_next(input); |
341 | |
342 | match res { |
343 | Err(winnow::error::ErrMode::Backtrack(_)) => terminatedMap, …>, …, …, …, …, …>(first:b"'" , second:peek(term.by_ref())) |
344 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
345 | .parse_next(input), |
346 | res: Result<(Located<&BStr>, &…), …> => res, |
347 | } |
348 | } |
349 | } |
350 | |
/// Unit tests that drive the top-level `string` parser with each of the four string forms.
#[cfg(test)]
mod test {
    use super::*;

    /// Escapes in a basic string are decoded, including 4- and 8-digit unicode escapes.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    /// Multiline basic strings: leading-newline trimming, embedded quotes, escaped backslash,
    /// and rejection of unterminated input.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        // Neither input has a complete closing delimiter.
        let invalid_cases = [r#""""  """#, r#""""  \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    /// A line-ending backslash swallows the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


fox jumps over \
the lazy dog.""""#,
            r#""""\
The quick brown \
fox jumps over \
the lazy dog.\
""""#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
""""#,
            r#""""
\
\
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    /// Literal strings are returned verbatim, minus the surrounding apostrophes.
    #[test]
    fn literal_string() {
        let inputs = [
            r#"'C:\Users\nodejs\templates'"#,
            r#"'\\ServerX\admin$\system32\'"#,
            r#"'Tom "Dubs" Preston-Werner'"#,
            r#"'<\i\c*\s*>'"#,
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    /// Multiline literal strings are returned verbatim, minus the delimiters and the first
    /// newline after the opening delimiter.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r#"'''I [dw]on't need \d{2} apples'''"#,
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
All other whitespace
is preserved.
'''"#;
        // Skip the opening delimiter plus the trimmed newline (4 bytes) and the closing delimiter.
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
465 | |