1 | use std::borrow::Cow; |
2 | use std::char; |
3 | use std::ops::RangeInclusive; |
4 | |
5 | use winnow::combinator::alt; |
6 | use winnow::combinator::cut_err; |
7 | use winnow::combinator::delimited; |
8 | use winnow::combinator::empty; |
9 | use winnow::combinator::fail; |
10 | use winnow::combinator::opt; |
11 | use winnow::combinator::peek; |
12 | use winnow::combinator::preceded; |
13 | use winnow::combinator::repeat; |
14 | use winnow::combinator::terminated; |
15 | use winnow::combinator::trace; |
16 | use winnow::prelude::*; |
17 | use winnow::stream::Stream; |
18 | use winnow::token::any; |
19 | use winnow::token::none_of; |
20 | use winnow::token::one_of; |
21 | use winnow::token::take_while; |
22 | |
23 | use crate::parser::error::CustomError; |
24 | use crate::parser::numbers::HEXDIG; |
25 | use crate::parser::prelude::*; |
26 | use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR}; |
27 | |
28 | // ;; String |
29 | |
30 | // string = ml-basic-string / basic-string / ml-literal-string / literal-string |
31 | pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
32 | traceimpl Parser, …>( |
33 | name:"string" , |
34 | parser:alt(( |
35 | ml_basic_string, |
36 | basic_string, |
37 | ml_literal_string, |
38 | literal_string.map(Cow::Borrowed), |
39 | )), |
40 | ) |
41 | .parse_next(input) |
42 | } |
43 | |
44 | // ;; Basic String |
45 | |
46 | // basic-string = quotation-mark *basic-char quotation-mark |
47 | pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
48 | traceimpl Parser, …>(name:"basic-string" , |input: &mut Input<'i>| { |
49 | let _ = one_of(QUOTATION_MARK).parse_next(input)?; |
50 | |
51 | let mut c: Cow<'_, str> = Cow::Borrowed("" ); |
52 | if let Some(ci: Cow<'_, str>) = opt(parser:basic_chars).parse_next(input)? { |
53 | c = ci; |
54 | } |
55 | while let Some(ci: Cow<'_, str>) = opt(parser:basic_chars).parse_next(input)? { |
56 | c.to_mut().push_str(&ci); |
57 | } |
58 | |
59 | let _ = cut_errContext, …>, …, …, …, …>(parser:one_of(QUOTATION_MARK)) |
60 | .context(StrContext::Label("basic string" )) |
61 | .parse_next(input)?; |
62 | |
63 | Ok(c) |
64 | }) |
65 | .parse_next(input) |
66 | } |
67 | |
// quotation-mark = %x22 ; "
/// The `"` byte (0x22) that delimits basic strings.
pub(crate) const QUOTATION_MARK: u8 = 0x22;
70 | |
71 | // basic-char = basic-unescaped / escaped |
72 | fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
73 | altimpl Parser, …>(( |
74 | // Deviate from the official grammar by batching the unescaped chars so we build a string a |
75 | // chunk at a time, rather than a `char` at a time. |
76 | take_whileTryMap, …>, …, …, …, …, …, …>(occurrences:1.., BASIC_UNESCAPED) |
77 | .try_map(std::str::from_utf8) |
78 | .map(Cow::Borrowed), |
79 | escaped.map(|c: char| Cow::Owned(String::from(c))), |
80 | )) |
81 | .parse_next(input) |
82 | } |
83 | |
84 | // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
85 | pub(crate) const BASIC_UNESCAPED: ( |
86 | (u8, u8), |
87 | u8, |
88 | RangeInclusive<u8>, |
89 | RangeInclusive<u8>, |
90 | RangeInclusive<u8>, |
91 | ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
92 | |
93 | // escaped = escape escape-seq-char |
94 | fn escaped(input: &mut Input<'_>) -> PResult<char> { |
95 | preceded(ESCAPE, parser:escape_seq_char).parse_next(input) |
96 | } |
97 | |
// escape = %x5C ; \
/// The `\` byte (0x5C) that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = b'\\';
100 | |
101 | // escape-seq-char = %x22 ; " quotation mark U+0022 |
102 | // escape-seq-char =/ %x5C ; \ reverse solidus U+005C |
103 | // escape-seq-char =/ %x62 ; b backspace U+0008 |
104 | // escape-seq-char =/ %x66 ; f form feed U+000C |
105 | // escape-seq-char =/ %x6E ; n line feed U+000A |
106 | // escape-seq-char =/ %x72 ; r carriage return U+000D |
107 | // escape-seq-char =/ %x74 ; t tab U+0009 |
108 | // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX |
109 | // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX |
110 | fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> { |
111 | dispatch! {any; |
112 | b'b' => empty.value(' \u{8}' ), |
113 | b'f' => empty.value(' \u{c}' ), |
114 | b'n' => empty.value(' \n' ), |
115 | b'r' => empty.value(' \r' ), |
116 | b't' => empty.value(' \t' ), |
117 | b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code" )), |
118 | b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code" )), |
119 | b' \\' => empty.value(' \\' ), |
120 | b'"' => empty.value('"' ), |
121 | _ => { |
122 | cut_err(fail::<_, char, _>) |
123 | .context(StrContext::Label("escape sequence" )) |
124 | .context(StrContext::Expected(StrContextValue::CharLiteral('b' ))) |
125 | .context(StrContext::Expected(StrContextValue::CharLiteral('f' ))) |
126 | .context(StrContext::Expected(StrContextValue::CharLiteral('n' ))) |
127 | .context(StrContext::Expected(StrContextValue::CharLiteral('r' ))) |
128 | .context(StrContext::Expected(StrContextValue::CharLiteral('t' ))) |
129 | .context(StrContext::Expected(StrContextValue::CharLiteral('u' ))) |
130 | .context(StrContext::Expected(StrContextValue::CharLiteral('U' ))) |
131 | .context(StrContext::Expected(StrContextValue::CharLiteral(' \\' ))) |
132 | .context(StrContext::Expected(StrContextValue::CharLiteral('"' ))) |
133 | } |
134 | } |
135 | .parse_next(input) |
136 | } |
137 | |
138 | pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> { |
139 | take_whileTryMap, …, …, …, …, …>, …, …, …, …, …>, …, …, …, …, …, …>(0..=N, HEXDIG) |
140 | .verify(|b: &[u8]| b.len() == N) |
141 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`is_ascii_digit` filters out on-ASCII" ) }) |
142 | .verify_map(|s: &str| u32::from_str_radix(src:s, radix:16).ok()) |
143 | .try_map(|h: u32| char::from_u32(h).ok_or(err:CustomError::OutOfRange)) |
144 | .parse_next(input) |
145 | } |
146 | |
147 | // ;; Multiline Basic String |
148 | |
149 | // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body |
150 | // ml-basic-string-delim |
151 | fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
152 | traceimpl Parser, …>( |
153 | name:"ml-basic-string" , |
154 | parser:delimited( |
155 | ML_BASIC_STRING_DELIM, |
156 | parser:preceded(opt(newline), cut_err(ml_basic_body)), |
157 | ignored2:cut_err(ML_BASIC_STRING_DELIM), |
158 | ) |
159 | .context(StrContext::Label("multiline basic string" )), |
160 | ) |
161 | .parse_next(input) |
162 | } |
163 | |
// ml-basic-string-delim = 3quotation-mark
/// The `"""` delimiter that opens and closes a multi-line basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
166 | |
167 | // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ] |
168 | fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
169 | let mut c = Cow::Borrowed("" ); |
170 | if let Some(ci) = opt(mlb_content).parse_next(input)? { |
171 | c = ci; |
172 | } |
173 | while let Some(ci) = opt(mlb_content).parse_next(input)? { |
174 | c.to_mut().push_str(&ci); |
175 | } |
176 | |
177 | while let Some(qi) = opt(mlb_quotes(none_of(b' \"' ).value(()))).parse_next(input)? { |
178 | if let Some(ci) = opt(mlb_content).parse_next(input)? { |
179 | c.to_mut().push_str(qi); |
180 | c.to_mut().push_str(&ci); |
181 | while let Some(ci) = opt(mlb_content).parse_next(input)? { |
182 | c.to_mut().push_str(&ci); |
183 | } |
184 | } else { |
185 | break; |
186 | } |
187 | } |
188 | |
189 | if let Some(qi) = opt(mlb_quotes(ML_BASIC_STRING_DELIM.void())).parse_next(input)? { |
190 | c.to_mut().push_str(qi); |
191 | } |
192 | |
193 | Ok(c) |
194 | } |
195 | |
196 | // mlb-content = mlb-char / newline / mlb-escaped-nl |
197 | // mlb-char = mlb-unescaped / escaped |
198 | fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
199 | altimpl Parser, …>(( |
200 | // Deviate from the official grammar by batching the unescaped chars so we build a string a |
201 | // chunk at a time, rather than a `char` at a time. |
202 | take_whileTryMap, …>, …, …, …, …, …, …>(occurrences:1.., MLB_UNESCAPED) |
203 | .try_map(std::str::from_utf8) |
204 | .map(Cow::Borrowed), |
205 | // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences |
206 | mlb_escaped_nl.map(|_| Cow::Borrowed("" )), |
207 | escaped.map(|c: char| Cow::Owned(String::from(c))), |
208 | newline.map(|_| Cow::Borrowed(" \n" )), |
209 | )) |
210 | .parse_next(input) |
211 | } |
212 | |
213 | // mlb-quotes = 1*2quotation-mark |
214 | fn mlb_quotes<'i>( |
215 | mut term: impl winnow::Parser<Input<'i>, (), ContextError>, |
216 | ) -> impl Parser<Input<'i>, &'i str, ContextError> { |
217 | move |input: &mut Input<'i>| { |
218 | let start: Checkpoint, …> = input.checkpoint(); |
219 | let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(parser:b" \"\"" , ignored:peek(parser:term.by_ref())) |
220 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
221 | .parse_next(input); |
222 | |
223 | match res { |
224 | Err(winnow::error::ErrMode::Backtrack(_)) => { |
225 | input.reset(&start); |
226 | terminatedMap, …>, …, …, …, …, …>(parser:b" \"" , ignored:peek(parser:term.by_ref())) |
227 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
228 | .parse_next(input) |
229 | } |
230 | res: Result<&str, ErrMode> => res, |
231 | } |
232 | } |
233 | } |
234 | |
235 | // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
236 | pub(crate) const MLB_UNESCAPED: ( |
237 | (u8, u8), |
238 | u8, |
239 | RangeInclusive<u8>, |
240 | RangeInclusive<u8>, |
241 | RangeInclusive<u8>, |
242 | ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
243 | |
244 | // mlb-escaped-nl = escape ws newline *( wschar / newline |
245 | // When the last non-whitespace character on a line is a \, |
246 | // it will be trimmed along with all whitespace |
247 | // (including newlines) up to the next non-whitespace |
248 | // character or closing delimiter. |
249 | fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> { |
250 | repeatValue(1.., (ESCAPE, ws, ws_newlines)) |
251 | .map(|()| ()) |
252 | .value(()) |
253 | .parse_next(input) |
254 | } |
255 | |
256 | // ;; Literal String |
257 | |
258 | // literal-string = apostrophe *literal-char apostrophe |
259 | pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> { |
260 | traceimpl Parser, …>( |
261 | name:"literal-string" , |
262 | parser:delimited( |
263 | APOSTROPHE, |
264 | parser:cut_err(take_while(0.., LITERAL_CHAR)), |
265 | ignored2:cut_err(APOSTROPHE), |
266 | ) |
267 | .try_map(std::str::from_utf8) |
268 | .context(StrContext::Label("literal string" )), |
269 | ) |
270 | .parse_next(input) |
271 | } |
272 | |
// apostrophe = %x27 ; ' apostrophe
/// The `'` byte (0x27) that delimits literal strings.
pub(crate) const APOSTROPHE: u8 = b'\'';
275 | |
276 | // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii |
277 | pub(crate) const LITERAL_CHAR: ( |
278 | u8, |
279 | RangeInclusive<u8>, |
280 | RangeInclusive<u8>, |
281 | RangeInclusive<u8>, |
282 | ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
283 | |
284 | // ;; Multiline Literal String |
285 | |
286 | // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body |
287 | // ml-literal-string-delim |
288 | fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
289 | traceimpl Parser, …>( |
290 | name:"ml-literal-string" , |
291 | parser:delimited( |
292 | (ML_LITERAL_STRING_DELIM, opt(newline)), |
293 | parser:cut_err(ml_literal_body.map(|t| { |
294 | if t.contains(" \r\n" ) { |
295 | Cow::Owned(t.replace(" \r\n" , " \n" )) |
296 | } else { |
297 | Cow::Borrowed(t) |
298 | } |
299 | })), |
300 | ignored2:cut_err(ML_LITERAL_STRING_DELIM), |
301 | ) |
302 | .context(StrContext::Label("multiline literal string" )), |
303 | ) |
304 | .parse_next(input) |
305 | } |
306 | |
// ml-literal-string-delim = 3apostrophe
/// The `'''` delimiter that opens and closes a multi-line literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
309 | |
310 | // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ] |
311 | fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> { |
312 | ( |
313 | repeat(occurrences:0.., parser:mll_content).map(|()| ()), |
314 | repeatRepeat<(impl Parser, …>, …), …, …, …, …>( |
315 | occurrences:0.., |
316 | ( |
317 | mll_quotes(term:none_of(APOSTROPHE).value(())), |
318 | repeat(occurrences:1.., parser:mll_content).map(|()| ()), |
319 | ), |
320 | ) |
321 | .map(|()| ()), |
322 | opt(parser:mll_quotes(ML_LITERAL_STRING_DELIM.void())), |
323 | ) |
324 | .recognize() |
325 | .try_map(std::str::from_utf8) |
326 | .parse_next(input) |
327 | } |
328 | |
329 | // mll-content = mll-char / newline |
330 | fn mll_content(input: &mut Input<'_>) -> PResult<u8> { |
331 | alt((one_of(MLL_CHAR), newline)).parse_next(input) |
332 | } |
333 | |
334 | // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii |
335 | const MLL_CHAR: ( |
336 | u8, |
337 | RangeInclusive<u8>, |
338 | RangeInclusive<u8>, |
339 | RangeInclusive<u8>, |
340 | ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
341 | |
342 | // mll-quotes = 1*2apostrophe |
343 | fn mll_quotes<'i>( |
344 | mut term: impl winnow::Parser<Input<'i>, (), ContextError>, |
345 | ) -> impl Parser<Input<'i>, &'i str, ContextError> { |
346 | move |input: &mut Input<'i>| { |
347 | let start: Checkpoint, …> = input.checkpoint(); |
348 | let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(parser:b"''" , ignored:peek(parser:term.by_ref())) |
349 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
350 | .parse_next(input); |
351 | |
352 | match res { |
353 | Err(winnow::error::ErrMode::Backtrack(_)) => { |
354 | input.reset(&start); |
355 | terminatedMap, …>, …, …, …, …, …>(parser:b"'" , ignored:peek(parser:term.by_ref())) |
356 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
357 | .parse_next(input) |
358 | } |
359 | res: Result<&str, ErrMode> => res, |
360 | } |
361 | } |
362 | } |
363 | |
// Unit tests for the string parsers. Every case goes through the top-level
// `string` entry point, so the ordering of its alternatives is covered too.
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    // Escapes (`\"`, `\t`, `\n`, `\uXXXX`, `\UXXXXXXXX`) are decoded.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // The first newline is trimmed, embedded quotes are kept, and unterminated
    // or over-quoted inputs are rejected.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let invalid_cases = [r#""""  """#, r#""""  \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash trims itself and all following whitespace and
    // newlines up to the next non-whitespace character or closing delimiter.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


fox jumps over \
the lazy dog.""""#,
            r#""""\
The quick brown \
fox jumps over \
the lazy dog.\
""""#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
""""#,
            r#""""
\
\
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: backslashes are not escapes.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];

        for input in &inputs {
            // Expected value is the input minus the surrounding apostrophes.
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multi-line literal strings keep their content verbatim, including an
    // apostrophe adjacent to the delimiters; only the first newline is dropped.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            // Expected value is the input minus the `'''` delimiters.
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
All other whitespace
is preserved.
'''"#;
        // Skip the opening delimiter plus the trimmed first newline (4 bytes).
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
480 | |