1use std::borrow::Cow;
2use std::char;
3use std::ops::RangeInclusive;
4
5use winnow::combinator::alt;
6use winnow::combinator::cut_err;
7use winnow::combinator::delimited;
8use winnow::combinator::fail;
9use winnow::combinator::opt;
10use winnow::combinator::peek;
11use winnow::combinator::preceded;
12use winnow::combinator::repeat;
13use winnow::combinator::success;
14use winnow::combinator::terminated;
15use winnow::prelude::*;
16use winnow::token::any;
17use winnow::token::none_of;
18use winnow::token::one_of;
19use winnow::token::tag;
20use winnow::token::take_while;
21
22use crate::parser::errors::CustomError;
23use crate::parser::numbers::HEXDIG;
24use crate::parser::prelude::*;
25use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
26
27// ;; String
28
29// string = ml-basic-string / basic-string / ml-literal-string / literal-string
30pub(crate) fn string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
31 altimpl Parser, …>((
32 ml_basic_string,
33 basic_string,
34 ml_literal_string,
35 literal_string.map(Cow::Borrowed),
36 ))
37 .parse_next(input)
38}
39
40// ;; Basic String
41
42// basic-string = quotation-mark *basic-char quotation-mark
43pub(crate) fn basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
44 let (mut input: Located<&BStr>, _) = one_of(QUOTATION_MARK).parse_next(input)?;
45
46 let mut c: Cow<'_, str> = Cow::Borrowed("");
47 if let Some((i: Located<&BStr>, ci: Cow<'_, str>)) = ok_error(res:basic_chars.parse_next(input))? {
48 input = i;
49 c = ci;
50 }
51 while let Some((i: Located<&BStr>, ci: Cow<'_, str>)) = ok_error(res:basic_chars.parse_next(input))? {
52 input = i;
53 c.to_mut().push_str(&ci);
54 }
55
56 let (input: Located<&BStr>, _) = cut_errContext, …>, …, …, …, …>(parser:one_of(QUOTATION_MARK))
57 .context(Context::Expression("basic string"))
58 .parse_next(input)?;
59
60 Ok((input, c))
61}
62
// quotation-mark = %x22            ; "
/// The `"` byte (0x22) that opens and closes a basic string.
pub(crate) const QUOTATION_MARK: u8 = b'"';
65
66// basic-char = basic-unescaped / escaped
67fn basic_chars(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
68 altimpl Parser, …>((
69 // Deviate from the official grammar by batching the unescaped chars so we build a string a
70 // chunk at a time, rather than a `char` at a time.
71 take_whileTryMap, …>, …, …, …, …, …, …>(range:1.., BASIC_UNESCAPED)
72 .try_map(std::str::from_utf8)
73 .map(Cow::Borrowed),
74 escaped.map(|c: char| Cow::Owned(String::from(c))),
75 ))
76 .parse_next(input)
77}
78
79// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
80pub(crate) const BASIC_UNESCAPED: (
81 (u8, u8),
82 u8,
83 RangeInclusive<u8>,
84 RangeInclusive<u8>,
85 RangeInclusive<u8>,
86) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
87
88// escaped = escape escape-seq-char
89fn escaped(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> {
90 preceded(ESCAPE, second:escape_seq_char).parse_next(input)
91}
92
// escape = %x5C                    ; \
/// The `\` byte (0x5C) that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = b'\\';
95
96// escape-seq-char = %x22 ; " quotation mark U+0022
97// escape-seq-char =/ %x5C ; \ reverse solidus U+005C
98// escape-seq-char =/ %x62 ; b backspace U+0008
99// escape-seq-char =/ %x66 ; f form feed U+000C
100// escape-seq-char =/ %x6E ; n line feed U+000A
101// escape-seq-char =/ %x72 ; r carriage return U+000D
102// escape-seq-char =/ %x74 ; t tab U+0009
103// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
104// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
105fn escape_seq_char(input: Input<'_>) -> IResult<Input<'_>, char, ParserError<'_>> {
106 dispatch! {any;
107 b'b' => success('\u{8}'),
108 b'f' => success('\u{c}'),
109 b'n' => success('\n'),
110 b'r' => success('\r'),
111 b't' => success('\t'),
112 b'u' => cut_err(hexescape::<4>).context(Context::Expression("unicode 4-digit hex code")),
113 b'U' => cut_err(hexescape::<8>).context(Context::Expression("unicode 8-digit hex code")),
114 b'\\' => success('\\'),
115 b'"' => success('"'),
116 _ => {
117 cut_err(fail::<_, char, _>)
118 .context(Context::Expression("escape sequence"))
119 .context(Context::Expected(ParserValue::CharLiteral('b')))
120 .context(Context::Expected(ParserValue::CharLiteral('f')))
121 .context(Context::Expected(ParserValue::CharLiteral('n')))
122 .context(Context::Expected(ParserValue::CharLiteral('r')))
123 .context(Context::Expected(ParserValue::CharLiteral('t')))
124 .context(Context::Expected(ParserValue::CharLiteral('u')))
125 .context(Context::Expected(ParserValue::CharLiteral('U')))
126 .context(Context::Expected(ParserValue::CharLiteral('\\')))
127 .context(Context::Expected(ParserValue::CharLiteral('"')))
128 }
129 }
130 .parse_next(input)
131}
132
133pub(crate) fn hexescape<const N: usize>(
134 input: Input<'_>,
135) -> IResult<Input<'_>, char, ParserError<'_>> {
136 take_whileTryMap, …, …, …, …, …>, …, …, …, …, …>, …, …, …, …, …, …>(0..=N, HEXDIG)
137 .verify(|b: &[u8]| b.len() == N)
138 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`is_ascii_digit` filters out on-ASCII") })
139 .verify_map(|s: &str| u32::from_str_radix(src:s, radix:16).ok())
140 .try_map(|h: u32| char::from_u32(h).ok_or(err:CustomError::OutOfRange))
141 .parse_next(input)
142}
143
144// ;; Multiline Basic String
145
146// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
147// ml-basic-string-delim
148fn ml_basic_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
149 delimitedContext, …>, …, …, …, …>(
150 ML_BASIC_STRING_DELIM,
151 second:preceded(opt(newline), cut_err(ml_basic_body)),
152 third:cut_err(ML_BASIC_STRING_DELIM),
153 )
154 .context(Context::Expression("multiline basic string"))
155 .parse_next(input)
156}
157
// ml-basic-string-delim = 3quotation-mark
/// The three-quote sequence `"""` that opens and closes a multiline basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
160
161// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
162fn ml_basic_body(mut input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
163 let mut c = Cow::Borrowed("");
164 if let Some((i, ci)) = ok_error(mlb_content.parse_next(input))? {
165 input = i;
166 c = ci;
167 }
168 while let Some((i, ci)) = ok_error(mlb_content.parse_next(input))? {
169 input = i;
170 c.to_mut().push_str(&ci);
171 }
172
173 while let Some((i, qi)) = ok_error(mlb_quotes(none_of(b'\"').value(())).parse_next(input))? {
174 if let Some((i, ci)) = ok_error(mlb_content.parse_next(i))? {
175 input = i;
176 c.to_mut().push_str(qi);
177 c.to_mut().push_str(&ci);
178 while let Some((i, ci)) = ok_error(mlb_content.parse_next(input))? {
179 input = i;
180 c.to_mut().push_str(&ci);
181 }
182 } else {
183 break;
184 }
185 }
186
187 if let Some((i, qi)) =
188 ok_error(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(())).parse_next(input))?
189 {
190 input = i;
191 c.to_mut().push_str(qi);
192 }
193
194 Ok((input, c))
195}
196
197// mlb-content = mlb-char / newline / mlb-escaped-nl
198// mlb-char = mlb-unescaped / escaped
199fn mlb_content(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
200 altimpl Parser, …>((
201 // Deviate from the official grammar by batching the unescaped chars so we build a string a
202 // chunk at a time, rather than a `char` at a time.
203 take_whileTryMap, …>, …, …, …, …, …, …>(range:1.., MLB_UNESCAPED)
204 .try_map(std::str::from_utf8)
205 .map(Cow::Borrowed),
206 // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
207 mlb_escaped_nl.map(|_| Cow::Borrowed("")),
208 escaped.map(|c: char| Cow::Owned(String::from(c))),
209 newline.map(|_| Cow::Borrowed("\n")),
210 ))
211 .parse_next(input)
212}
213
214// mlb-quotes = 1*2quotation-mark
215fn mlb_quotes<'i>(
216 mut term: impl winnow::Parser<Input<'i>, (), ParserError<'i>>,
217) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> {
218 move |input: Located<&BStr>| {
219 let res: Result<(Located<&BStr>, &…), …> = terminatedMap, …>, …, …, …, …, …>(first:b"\"\"", second:peek(term.by_ref()))
220 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
221 .parse_next(input);
222
223 match res {
224 Err(winnow::error::ErrMode::Backtrack(_)) => terminatedMap, …>, …, …, …, …, …>(first:b"\"", second:peek(term.by_ref()))
225 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
226 .parse_next(input),
227 res: Result<(Located<&BStr>, &…), …> => res,
228 }
229 }
230}
231
232// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
233pub(crate) const MLB_UNESCAPED: (
234 (u8, u8),
235 u8,
236 RangeInclusive<u8>,
237 RangeInclusive<u8>,
238 RangeInclusive<u8>,
239) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
240
241// mlb-escaped-nl = escape ws newline *( wschar / newline
242// When the last non-whitespace character on a line is a \,
243// it will be trimmed along with all whitespace
244// (including newlines) up to the next non-whitespace
245// character or closing delimiter.
246fn mlb_escaped_nl(input: Input<'_>) -> IResult<Input<'_>, (), ParserError<'_>> {
247 repeatValue, …>, …, …, …, …, …>, …, …, …, …>(1.., (ESCAPE, ws, ws_newlines))
248 .map(|()| ())
249 .value(())
250 .parse_next(input)
251}
252
253// ;; Literal String
254
255// literal-string = apostrophe *literal-char apostrophe
256pub(crate) fn literal_string(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
257 delimitedContext, …, …, …, …, …, …>, …, …, …, …>(
258 APOSTROPHE,
259 second:cut_err(take_while(0.., LITERAL_CHAR)),
260 third:cut_err(APOSTROPHE),
261 )
262 .try_map(std::str::from_utf8)
263 .context(Context::Expression("literal string"))
264 .parse_next(input)
265}
266
// apostrophe = %x27 ; ' apostrophe
/// The `'` byte (0x27) that opens and closes a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
269
270// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
271pub(crate) const LITERAL_CHAR: (
272 u8,
273 RangeInclusive<u8>,
274 RangeInclusive<u8>,
275 RangeInclusive<u8>,
276) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
277
278// ;; Multiline Literal String
279
280// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
281// ml-literal-string-delim
282fn ml_literal_string(input: Input<'_>) -> IResult<Input<'_>, Cow<'_, str>, ParserError<'_>> {
283 delimitedContext, …>, …, …, …, …>(
284 (ML_LITERAL_STRING_DELIM, opt(newline)),
285 second:cut_err(ml_literal_body.map(|t| {
286 if t.contains("\r\n") {
287 Cow::Owned(t.replace("\r\n", "\n"))
288 } else {
289 Cow::Borrowed(t)
290 }
291 })),
292 third:cut_err(ML_LITERAL_STRING_DELIM),
293 )
294 .context(Context::Expression("multiline literal string"))
295 .parse_next(input)
296}
297
// ml-literal-string-delim = 3apostrophe
/// The three-apostrophe sequence `'''` that opens and closes a multiline literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
300
301// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
302fn ml_literal_body(input: Input<'_>) -> IResult<Input<'_>, &str, ParserError<'_>> {
303 (
304 repeat(range:0.., f:mll_content).map(|()| ()),
305 repeatimpl Parser, …>(
306 range:0..,
307 (
308 mll_quotes(term:none_of(APOSTROPHE).value(())),
309 repeat(range:1.., f:mll_content).map(|()| ()),
310 ),
311 )
312 .map(|()| ()),
313 opt(mll_quotes(term:tag(ML_LITERAL_STRING_DELIM).value(()))),
314 )
315 .recognize()
316 .try_map(std::str::from_utf8)
317 .parse_next(input)
318}
319
320// mll-content = mll-char / newline
321fn mll_content(input: Input<'_>) -> IResult<Input<'_>, u8, ParserError<'_>> {
322 alt((one_of(MLL_CHAR), newline)).parse_next(input)
323}
324
325// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
326const MLL_CHAR: (
327 u8,
328 RangeInclusive<u8>,
329 RangeInclusive<u8>,
330 RangeInclusive<u8>,
331) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
332
333// mll-quotes = 1*2apostrophe
334fn mll_quotes<'i>(
335 mut term: impl winnow::Parser<Input<'i>, (), ParserError<'i>>,
336) -> impl FnMut(Input<'i>) -> IResult<Input<'i>, &str, ParserError<'i>> {
337 move |input: Located<&BStr>| {
338 let res: Result<(Located<&BStr>, &…), …> = terminatedMap, …>, …, …, …, …, …>(first:b"''", second:peek(term.by_ref()))
339 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
340 .parse_next(input);
341
342 match res {
343 Err(winnow::error::ErrMode::Backtrack(_)) => terminatedMap, …>, …, …, …, …, …>(first:b"'", second:peek(term.by_ref()))
344 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
345 .parse_next(input),
346 res: Result<(Located<&BStr>, &…), …> => res,
347 }
348 }
349}
350
#[cfg(test)]
mod test {
    use super::*;

    // Escapes of every kind (quote, tab, newline, 4- and 8-digit unicode) are decoded.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // The newline after the opening delimiter is dropped; quotes inside the body are kept;
    // unterminated inputs are rejected.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let invalid_cases = [r#"""" """#, r#"""" \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash removes itself and all following whitespace, newlines included.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


 fox jumps over \
 the lazy dog.""""#,
            r#""""\
 The quick brown \
 fox jumps over \
 the lazy dog.\
 """"#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
 """"#,
            r#""""
\
 \
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: the expected value is the input minus its delimiters.
    #[test]
    fn literal_string() {
        let inputs = [
            r#"'C:\Users\nodejs\templates'"#,
            r#"'\\ServerX\admin$\system32\'"#,
            r#"'Tom "Dubs" Preston-Werner'"#,
            r#"'<\i\c*\s*>'"#,
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multiline literal strings are returned verbatim, minus the delimiters and the newline
    // that directly follows the opening delimiter.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r#"'''I [dw]on't need \d{2} apples'''"#,
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
 All other whitespace
 is preserved.
'''"#;
        // Skip the opening delimiter plus the newline after it (3 + 1 bytes).
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
465