1 | use std::borrow::Cow; |
2 | use std::char; |
3 | use std::ops::RangeInclusive; |
4 | |
5 | use winnow::combinator::alt; |
6 | use winnow::combinator::cut_err; |
7 | use winnow::combinator::delimited; |
8 | use winnow::combinator::fail; |
9 | use winnow::combinator::opt; |
10 | use winnow::combinator::peek; |
11 | use winnow::combinator::preceded; |
12 | use winnow::combinator::repeat; |
13 | use winnow::combinator::success; |
14 | use winnow::combinator::terminated; |
15 | use winnow::prelude::*; |
16 | use winnow::stream::Stream; |
17 | use winnow::token::any; |
18 | use winnow::token::none_of; |
19 | use winnow::token::one_of; |
20 | use winnow::token::tag; |
21 | use winnow::token::take_while; |
22 | use winnow::trace::trace; |
23 | |
24 | use crate::parser::errors::CustomError; |
25 | use crate::parser::numbers::HEXDIG; |
26 | use crate::parser::prelude::*; |
27 | use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR}; |
28 | |
29 | // ;; String |
30 | |
31 | // string = ml-basic-string / basic-string / ml-literal-string / literal-string |
32 | pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
33 | traceimpl Parser, …>( |
34 | name:"string" , |
35 | parser:alt(( |
36 | ml_basic_string, |
37 | basic_string, |
38 | ml_literal_string, |
39 | literal_string.map(Cow::Borrowed), |
40 | )), |
41 | ) |
42 | .parse_next(input) |
43 | } |
44 | |
45 | // ;; Basic String |
46 | |
47 | // basic-string = quotation-mark *basic-char quotation-mark |
48 | pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
49 | traceimpl Parser, …>(name:"basic-string" , |input: &mut Input<'i>| { |
50 | let _ = one_of(QUOTATION_MARK).parse_next(input)?; |
51 | |
52 | let mut c: Cow<'_, str> = Cow::Borrowed("" ); |
53 | if let Some(ci: Cow<'_, str>) = opt(basic_chars).parse_next(input)? { |
54 | c = ci; |
55 | } |
56 | while let Some(ci: Cow<'_, str>) = opt(basic_chars).parse_next(input)? { |
57 | c.to_mut().push_str(&ci); |
58 | } |
59 | |
60 | let _ = cut_errContext, …>, …, …, …, …>(parser:one_of(QUOTATION_MARK)) |
61 | .context(StrContext::Label("basic string" )) |
62 | .parse_next(input)?; |
63 | |
64 | Ok(c) |
65 | }) |
66 | .parse_next(input) |
67 | } |
68 | |
69 | // quotation-mark = %x22 ; " |
/// Delimiter byte of a basic string: the ASCII double quote (`"`, `%x22`).
pub(crate) const QUOTATION_MARK: u8 = 0x22;
71 | |
72 | // basic-char = basic-unescaped / escaped |
73 | fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
74 | altimpl Parser, …>(( |
75 | // Deviate from the official grammar by batching the unescaped chars so we build a string a |
76 | // chunk at a time, rather than a `char` at a time. |
77 | take_whileTryMap, …>, …, …, …, …, …, …>(range:1.., BASIC_UNESCAPED) |
78 | .try_map(std::str::from_utf8) |
79 | .map(Cow::Borrowed), |
80 | escaped.map(|c: char| Cow::Owned(String::from(c))), |
81 | )) |
82 | .parse_next(input) |
83 | } |
84 | |
85 | // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
86 | pub(crate) const BASIC_UNESCAPED: ( |
87 | (u8, u8), |
88 | u8, |
89 | RangeInclusive<u8>, |
90 | RangeInclusive<u8>, |
91 | RangeInclusive<u8>, |
92 | ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
93 | |
94 | // escaped = escape escape-seq-char |
95 | fn escaped(input: &mut Input<'_>) -> PResult<char> { |
96 | preceded(ESCAPE, second:escape_seq_char).parse_next(input) |
97 | } |
98 | |
99 | // escape = %x5C ; \ |
/// Byte that introduces an escape sequence: the backslash (`\`, `%x5C`).
pub(crate) const ESCAPE: u8 = b'\\';
101 | |
102 | // escape-seq-char = %x22 ; " quotation mark U+0022 |
103 | // escape-seq-char =/ %x5C ; \ reverse solidus U+005C |
104 | // escape-seq-char =/ %x62 ; b backspace U+0008 |
105 | // escape-seq-char =/ %x66 ; f form feed U+000C |
106 | // escape-seq-char =/ %x6E ; n line feed U+000A |
107 | // escape-seq-char =/ %x72 ; r carriage return U+000D |
108 | // escape-seq-char =/ %x74 ; t tab U+0009 |
109 | // escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX |
110 | // escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX |
111 | fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> { |
112 | dispatch! {any; |
113 | b'b' => success(' \u{8}' ), |
114 | b'f' => success(' \u{c}' ), |
115 | b'n' => success(' \n' ), |
116 | b'r' => success(' \r' ), |
117 | b't' => success(' \t' ), |
118 | b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code" )), |
119 | b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code" )), |
120 | b' \\' => success(' \\' ), |
121 | b'"' => success('"' ), |
122 | _ => { |
123 | cut_err(fail::<_, char, _>) |
124 | .context(StrContext::Label("escape sequence" )) |
125 | .context(StrContext::Expected(StrContextValue::CharLiteral('b' ))) |
126 | .context(StrContext::Expected(StrContextValue::CharLiteral('f' ))) |
127 | .context(StrContext::Expected(StrContextValue::CharLiteral('n' ))) |
128 | .context(StrContext::Expected(StrContextValue::CharLiteral('r' ))) |
129 | .context(StrContext::Expected(StrContextValue::CharLiteral('t' ))) |
130 | .context(StrContext::Expected(StrContextValue::CharLiteral('u' ))) |
131 | .context(StrContext::Expected(StrContextValue::CharLiteral('U' ))) |
132 | .context(StrContext::Expected(StrContextValue::CharLiteral(' \\' ))) |
133 | .context(StrContext::Expected(StrContextValue::CharLiteral('"' ))) |
134 | } |
135 | } |
136 | .parse_next(input) |
137 | } |
138 | |
139 | pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> { |
140 | take_whileTryMap, …, …, …, …, …>, …, …, …, …, …>, …, …, …, …, …, …>(0..=N, HEXDIG) |
141 | .verify(|b: &[u8]| b.len() == N) |
142 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`is_ascii_digit` filters out on-ASCII" ) }) |
143 | .verify_map(|s: &str| u32::from_str_radix(src:s, radix:16).ok()) |
144 | .try_map(|h: u32| char::from_u32(h).ok_or(err:CustomError::OutOfRange)) |
145 | .parse_next(input) |
146 | } |
147 | |
148 | // ;; Multiline Basic String |
149 | |
150 | // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body |
151 | // ml-basic-string-delim |
152 | fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
153 | traceimpl Parser, …>( |
154 | name:"ml-basic-string" , |
155 | parser:delimited( |
156 | ML_BASIC_STRING_DELIM, |
157 | second:preceded(opt(newline), cut_err(ml_basic_body)), |
158 | third:cut_err(ML_BASIC_STRING_DELIM), |
159 | ) |
160 | .context(StrContext::Label("multiline basic string" )), |
161 | ) |
162 | .parse_next(input) |
163 | } |
164 | |
165 | // ml-basic-string-delim = 3quotation-mark |
/// Opening and closing delimiter of a multiline basic string: three quotation marks.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
167 | |
168 | // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ] |
169 | fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
170 | let mut c = Cow::Borrowed("" ); |
171 | if let Some(ci) = opt(mlb_content).parse_next(input)? { |
172 | c = ci; |
173 | } |
174 | while let Some(ci) = opt(mlb_content).parse_next(input)? { |
175 | c.to_mut().push_str(&ci); |
176 | } |
177 | |
178 | while let Some(qi) = opt(mlb_quotes(none_of(b' \"' ).value(()))).parse_next(input)? { |
179 | if let Some(ci) = opt(mlb_content).parse_next(input)? { |
180 | c.to_mut().push_str(qi); |
181 | c.to_mut().push_str(&ci); |
182 | while let Some(ci) = opt(mlb_content).parse_next(input)? { |
183 | c.to_mut().push_str(&ci); |
184 | } |
185 | } else { |
186 | break; |
187 | } |
188 | } |
189 | |
190 | if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? { |
191 | c.to_mut().push_str(qi); |
192 | } |
193 | |
194 | Ok(c) |
195 | } |
196 | |
197 | // mlb-content = mlb-char / newline / mlb-escaped-nl |
198 | // mlb-char = mlb-unescaped / escaped |
199 | fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
200 | altimpl Parser, …>(( |
201 | // Deviate from the official grammar by batching the unescaped chars so we build a string a |
202 | // chunk at a time, rather than a `char` at a time. |
203 | take_whileTryMap, …>, …, …, …, …, …, …>(range:1.., MLB_UNESCAPED) |
204 | .try_map(std::str::from_utf8) |
205 | .map(Cow::Borrowed), |
206 | // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences |
207 | mlb_escaped_nl.map(|_| Cow::Borrowed("" )), |
208 | escaped.map(|c: char| Cow::Owned(String::from(c))), |
209 | newline.map(|_| Cow::Borrowed(" \n" )), |
210 | )) |
211 | .parse_next(input) |
212 | } |
213 | |
214 | // mlb-quotes = 1*2quotation-mark |
215 | fn mlb_quotes<'i>( |
216 | mut term: impl winnow::Parser<Input<'i>, (), ContextError>, |
217 | ) -> impl Parser<Input<'i>, &'i str, ContextError> { |
218 | move |input: &mut Input<'i>| { |
219 | let start: Checkpoint> = input.checkpoint(); |
220 | let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(first:b" \"\"" , second:peek(term.by_ref())) |
221 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
222 | .parse_next(input); |
223 | |
224 | match res { |
225 | Err(winnow::error::ErrMode::Backtrack(_)) => { |
226 | input.reset(checkpoint:start); |
227 | terminatedMap, …>, …, …, …, …, …>(first:b" \"" , second:peek(term.by_ref())) |
228 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
229 | .parse_next(input) |
230 | } |
231 | res: Result<&str, ErrMode> => res, |
232 | } |
233 | } |
234 | } |
235 | |
236 | // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii |
237 | pub(crate) const MLB_UNESCAPED: ( |
238 | (u8, u8), |
239 | u8, |
240 | RangeInclusive<u8>, |
241 | RangeInclusive<u8>, |
242 | RangeInclusive<u8>, |
243 | ) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII); |
244 | |
// mlb-escaped-nl = escape ws newline *( wschar / newline )
246 | // When the last non-whitespace character on a line is a \, |
247 | // it will be trimmed along with all whitespace |
248 | // (including newlines) up to the next non-whitespace |
249 | // character or closing delimiter. |
250 | fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> { |
251 | repeatValue(1.., (ESCAPE, ws, ws_newlines)) |
252 | .map(|()| ()) |
253 | .value(()) |
254 | .parse_next(input) |
255 | } |
256 | |
257 | // ;; Literal String |
258 | |
259 | // literal-string = apostrophe *literal-char apostrophe |
260 | pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> { |
261 | traceimpl Parser, …>( |
262 | name:"literal-string" , |
263 | parser:delimited( |
264 | APOSTROPHE, |
265 | second:cut_err(take_while(0.., LITERAL_CHAR)), |
266 | third:cut_err(APOSTROPHE), |
267 | ) |
268 | .try_map(std::str::from_utf8) |
269 | .context(StrContext::Label("literal string" )), |
270 | ) |
271 | .parse_next(input) |
272 | } |
273 | |
274 | // apostrophe = %x27 ; ' apostrophe |
/// Delimiter byte of a literal string: the ASCII apostrophe (`'`, `%x27`).
pub(crate) const APOSTROPHE: u8 = b'\'';
276 | |
277 | // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii |
278 | pub(crate) const LITERAL_CHAR: ( |
279 | u8, |
280 | RangeInclusive<u8>, |
281 | RangeInclusive<u8>, |
282 | RangeInclusive<u8>, |
283 | ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
284 | |
285 | // ;; Multiline Literal String |
286 | |
287 | // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body |
288 | // ml-literal-string-delim |
289 | fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> { |
290 | traceimpl Parser, …>( |
291 | name:"ml-literal-string" , |
292 | parser:delimited( |
293 | (ML_LITERAL_STRING_DELIM, opt(newline)), |
294 | second:cut_err(ml_literal_body.map(|t| { |
295 | if t.contains(" \r\n" ) { |
296 | Cow::Owned(t.replace(" \r\n" , " \n" )) |
297 | } else { |
298 | Cow::Borrowed(t) |
299 | } |
300 | })), |
301 | third:cut_err(ML_LITERAL_STRING_DELIM), |
302 | ) |
303 | .context(StrContext::Label("multiline literal string" )), |
304 | ) |
305 | .parse_next(input) |
306 | } |
307 | |
308 | // ml-literal-string-delim = 3apostrophe |
/// Opening and closing delimiter of a multiline literal string: three apostrophes.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = &[b'\''; 3];
310 | |
311 | // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ] |
312 | fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> { |
313 | ( |
314 | repeat(range:0.., parser:mll_content).map(|()| ()), |
315 | repeatRepeat<(impl Parser, …>, …), …, …, …, …>( |
316 | range:0.., |
317 | ( |
318 | mll_quotes(term:none_of(APOSTROPHE).value(())), |
319 | repeat(range:1.., parser:mll_content).map(|()| ()), |
320 | ), |
321 | ) |
322 | .map(|()| ()), |
323 | opt(mll_quotes(term:tag(ML_LITERAL_STRING_DELIM).value(()))), |
324 | ) |
325 | .recognize() |
326 | .try_map(std::str::from_utf8) |
327 | .parse_next(input) |
328 | } |
329 | |
330 | // mll-content = mll-char / newline |
331 | fn mll_content(input: &mut Input<'_>) -> PResult<u8> { |
332 | alt((one_of(MLL_CHAR), newline)).parse_next(input) |
333 | } |
334 | |
335 | // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii |
336 | const MLL_CHAR: ( |
337 | u8, |
338 | RangeInclusive<u8>, |
339 | RangeInclusive<u8>, |
340 | RangeInclusive<u8>, |
341 | ) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII); |
342 | |
343 | // mll-quotes = 1*2apostrophe |
344 | fn mll_quotes<'i>( |
345 | mut term: impl winnow::Parser<Input<'i>, (), ContextError>, |
346 | ) -> impl Parser<Input<'i>, &'i str, ContextError> { |
347 | move |input: &mut Input<'i>| { |
348 | let start: Checkpoint> = input.checkpoint(); |
349 | let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(first:b"''" , second:peek(term.by_ref())) |
350 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
351 | .parse_next(input); |
352 | |
353 | match res { |
354 | Err(winnow::error::ErrMode::Backtrack(_)) => { |
355 | input.reset(checkpoint:start); |
356 | terminatedMap, …>, …, …, …, …, …>(first:b"'" , second:peek(term.by_ref())) |
357 | .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII" ) }) |
358 | .parse_next(input) |
359 | } |
360 | res: Result<&str, ErrMode> => res, |
361 | } |
362 | } |
363 | } |
364 | |
// Unit tests for the string parsers; every case goes through the top-level `string` parser.
//
// NOTE(review): indentation inside the multi-line raw strings below is reproduced as it appears
// in the reviewed text; confirm it against the upstream file if exact whitespace matters.
#[cfg(test)]
mod test {
    use super::*;

    // Escapes (`\"`, `\t`, `\n`, `\uXXXX`, `\UXXXXXXXX`) are resolved in a basic string.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Multiline basic strings: leading newline trimmed, embedded quotes, escaped backslash.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        // Both inputs lack a complete closing `"""`.
        let invalid_cases = [r#"""" """#, r#"""" \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash swallows the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


fox jumps over \
the lazy dog.""""#,
            r#""""\
The quick brown \
fox jumps over \
the lazy dog.\
""""#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
""""#,
            r#""""
\
\
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: the expected value is the input minus its quotes.
    #[test]
    fn literal_string() {
        let inputs = [
            r#"'C:\Users\nodejs\templates'"#,
            r#"'\\ServerX\admin$\system32\'"#,
            r#"'Tom "Dubs" Preston-Werner'"#,
            r#"'<\i\c*\s*>'"#,
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multiline literal strings: verbatim contents, with only the first newline trimmed.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r#"'''I [dw]on't need \d{2} apples'''"#,
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
All other whitespace
is preserved.
'''"#;
        // Skip the three-apostrophe opener plus the trimmed newline (4 bytes in total).
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
479 | |