use std::borrow::Cow;
use std::char;
use std::ops::RangeInclusive;

use winnow::combinator::alt;
use winnow::combinator::cut_err;
use winnow::combinator::delimited;
use winnow::combinator::empty;
use winnow::combinator::fail;
use winnow::combinator::opt;
use winnow::combinator::peek;
use winnow::combinator::preceded;
use winnow::combinator::repeat;
use winnow::combinator::terminated;
use winnow::combinator::trace;
use winnow::prelude::*;
use winnow::stream::Stream;
use winnow::token::any;
use winnow::token::none_of;
use winnow::token::one_of;
use winnow::token::take_while;

use crate::parser::error::CustomError;
use crate::parser::numbers::HEXDIG;
use crate::parser::prelude::*;
use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
27
// ;; String
29
30// string = ml-basic-string / basic-string / ml-literal-string / literal-string
31pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
32 traceimpl Parser, …>(
33 name:"string",
34 parser:alt((
35 ml_basic_string,
36 basic_string,
37 ml_literal_string,
38 literal_string.map(Cow::Borrowed),
39 )),
40 )
41 .parse_next(input)
42}
43
// ;; Basic String
45
46// basic-string = quotation-mark *basic-char quotation-mark
47pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
48 traceimpl Parser, …>(name:"basic-string", |input: &mut Input<'i>| {
49 let _ = one_of(QUOTATION_MARK).parse_next(input)?;
50
51 let mut c: Cow<'_, str> = Cow::Borrowed("");
52 if let Some(ci: Cow<'_, str>) = opt(parser:basic_chars).parse_next(input)? {
53 c = ci;
54 }
55 while let Some(ci: Cow<'_, str>) = opt(parser:basic_chars).parse_next(input)? {
56 c.to_mut().push_str(&ci);
57 }
58
59 let _ = cut_errContext, …>, …, …, …, …>(parser:one_of(QUOTATION_MARK))
60 .context(StrContext::Label("basic string"))
61 .parse_next(input)?;
62
63 Ok(c)
64 })
65 .parse_next(input)
66}
67
// quotation-mark = %x22            ; "
/// Delimiter byte of a basic string.
pub(crate) const QUOTATION_MARK: u8 = b'"';
70
71// basic-char = basic-unescaped / escaped
72fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
73 altimpl Parser, …>((
74 // Deviate from the official grammar by batching the unescaped chars so we build a string a
75 // chunk at a time, rather than a `char` at a time.
76 take_whileTryMap, …>, …, …, …, …, …, …>(occurrences:1.., BASIC_UNESCAPED)
77 .try_map(std::str::from_utf8)
78 .map(Cow::Borrowed),
79 escaped.map(|c: char| Cow::Owned(String::from(c))),
80 ))
81 .parse_next(input)
82}
83
84// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
85pub(crate) const BASIC_UNESCAPED: (
86 (u8, u8),
87 u8,
88 RangeInclusive<u8>,
89 RangeInclusive<u8>,
90 RangeInclusive<u8>,
91) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
92
93// escaped = escape escape-seq-char
94fn escaped(input: &mut Input<'_>) -> PResult<char> {
95 preceded(ESCAPE, parser:escape_seq_char).parse_next(input)
96}
97
// escape = %x5C                    ; \
/// Byte that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = b'\\';
100
101// escape-seq-char = %x22 ; " quotation mark U+0022
102// escape-seq-char =/ %x5C ; \ reverse solidus U+005C
103// escape-seq-char =/ %x62 ; b backspace U+0008
104// escape-seq-char =/ %x66 ; f form feed U+000C
105// escape-seq-char =/ %x6E ; n line feed U+000A
106// escape-seq-char =/ %x72 ; r carriage return U+000D
107// escape-seq-char =/ %x74 ; t tab U+0009
108// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
109// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
110fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
111 dispatch! {any;
112 b'b' => empty.value('\u{8}'),
113 b'f' => empty.value('\u{c}'),
114 b'n' => empty.value('\n'),
115 b'r' => empty.value('\r'),
116 b't' => empty.value('\t'),
117 b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
118 b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
119 b'\\' => empty.value('\\'),
120 b'"' => empty.value('"'),
121 _ => {
122 cut_err(fail::<_, char, _>)
123 .context(StrContext::Label("escape sequence"))
124 .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
125 .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
126 .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
127 .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
128 .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
129 .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
130 .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
131 .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
132 .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
133 }
134 }
135 .parse_next(input)
136}
137
138pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
139 take_whileTryMap, …, …, …, …, …>, …, …, …, …, …>, …, …, …, …, …, …>(0..=N, HEXDIG)
140 .verify(|b: &[u8]| b.len() == N)
141 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`is_ascii_digit` filters out on-ASCII") })
142 .verify_map(|s: &str| u32::from_str_radix(src:s, radix:16).ok())
143 .try_map(|h: u32| char::from_u32(h).ok_or(err:CustomError::OutOfRange))
144 .parse_next(input)
145}
146
// ;; Multiline Basic String
148
149// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
150// ml-basic-string-delim
151fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
152 traceimpl Parser, …>(
153 name:"ml-basic-string",
154 parser:delimited(
155 ML_BASIC_STRING_DELIM,
156 parser:preceded(opt(newline), cut_err(ml_basic_body)),
157 ignored2:cut_err(ML_BASIC_STRING_DELIM),
158 )
159 .context(StrContext::Label("multiline basic string")),
160 )
161 .parse_next(input)
162}
163
// ml-basic-string-delim = 3quotation-mark
/// Opening and closing delimiter of a multi-line basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
166
167// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
168fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
169 let mut c = Cow::Borrowed("");
170 if let Some(ci) = opt(mlb_content).parse_next(input)? {
171 c = ci;
172 }
173 while let Some(ci) = opt(mlb_content).parse_next(input)? {
174 c.to_mut().push_str(&ci);
175 }
176
177 while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
178 if let Some(ci) = opt(mlb_content).parse_next(input)? {
179 c.to_mut().push_str(qi);
180 c.to_mut().push_str(&ci);
181 while let Some(ci) = opt(mlb_content).parse_next(input)? {
182 c.to_mut().push_str(&ci);
183 }
184 } else {
185 break;
186 }
187 }
188
189 if let Some(qi) = opt(mlb_quotes(ML_BASIC_STRING_DELIM.void())).parse_next(input)? {
190 c.to_mut().push_str(qi);
191 }
192
193 Ok(c)
194}
195
196// mlb-content = mlb-char / newline / mlb-escaped-nl
197// mlb-char = mlb-unescaped / escaped
198fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
199 altimpl Parser, …>((
200 // Deviate from the official grammar by batching the unescaped chars so we build a string a
201 // chunk at a time, rather than a `char` at a time.
202 take_whileTryMap, …>, …, …, …, …, …, …>(occurrences:1.., MLB_UNESCAPED)
203 .try_map(std::str::from_utf8)
204 .map(Cow::Borrowed),
205 // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
206 mlb_escaped_nl.map(|_| Cow::Borrowed("")),
207 escaped.map(|c: char| Cow::Owned(String::from(c))),
208 newline.map(|_| Cow::Borrowed("\n")),
209 ))
210 .parse_next(input)
211}
212
213// mlb-quotes = 1*2quotation-mark
214fn mlb_quotes<'i>(
215 mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
216) -> impl Parser<Input<'i>, &'i str, ContextError> {
217 move |input: &mut Input<'i>| {
218 let start: Checkpoint, …> = input.checkpoint();
219 let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(parser:b"\"\"", ignored:peek(parser:term.by_ref()))
220 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
221 .parse_next(input);
222
223 match res {
224 Err(winnow::error::ErrMode::Backtrack(_)) => {
225 input.reset(&start);
226 terminatedMap, …>, …, …, …, …, …>(parser:b"\"", ignored:peek(parser:term.by_ref()))
227 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
228 .parse_next(input)
229 }
230 res: Result<&str, ErrMode> => res,
231 }
232 }
233}
234
235// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
236pub(crate) const MLB_UNESCAPED: (
237 (u8, u8),
238 u8,
239 RangeInclusive<u8>,
240 RangeInclusive<u8>,
241 RangeInclusive<u8>,
242) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
243
244// mlb-escaped-nl = escape ws newline *( wschar / newline
245// When the last non-whitespace character on a line is a \,
246// it will be trimmed along with all whitespace
247// (including newlines) up to the next non-whitespace
248// character or closing delimiter.
249fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
250 repeatValue …, …), …, …, …, …>, …, …, …, …, …>, …, …, …, …>(1.., (ESCAPE, ws, ws_newlines))
251 .map(|()| ())
252 .value(())
253 .parse_next(input)
254}
255
// ;; Literal String
257
258// literal-string = apostrophe *literal-char apostrophe
259pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
260 traceimpl Parser, …>(
261 name:"literal-string",
262 parser:delimited(
263 APOSTROPHE,
264 parser:cut_err(take_while(0.., LITERAL_CHAR)),
265 ignored2:cut_err(APOSTROPHE),
266 )
267 .try_map(std::str::from_utf8)
268 .context(StrContext::Label("literal string")),
269 )
270 .parse_next(input)
271}
272
// apostrophe = %x27 ; ' apostrophe
/// Delimiter byte of a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
275
276// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
277pub(crate) const LITERAL_CHAR: (
278 u8,
279 RangeInclusive<u8>,
280 RangeInclusive<u8>,
281 RangeInclusive<u8>,
282) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
283
// ;; Multiline Literal String
285
286// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
287// ml-literal-string-delim
288fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
289 traceimpl Parser, …>(
290 name:"ml-literal-string",
291 parser:delimited(
292 (ML_LITERAL_STRING_DELIM, opt(newline)),
293 parser:cut_err(ml_literal_body.map(|t| {
294 if t.contains("\r\n") {
295 Cow::Owned(t.replace("\r\n", "\n"))
296 } else {
297 Cow::Borrowed(t)
298 }
299 })),
300 ignored2:cut_err(ML_LITERAL_STRING_DELIM),
301 )
302 .context(StrContext::Label("multiline literal string")),
303 )
304 .parse_next(input)
305}
306
// ml-literal-string-delim = 3apostrophe
/// Opening and closing delimiter of a multi-line literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
309
310// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
311fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
312 (
313 repeat(occurrences:0.., parser:mll_content).map(|()| ()),
314 repeatRepeat<(impl Parser, …>, …), …, …, …, …>(
315 occurrences:0..,
316 (
317 mll_quotes(term:none_of(APOSTROPHE).value(())),
318 repeat(occurrences:1.., parser:mll_content).map(|()| ()),
319 ),
320 )
321 .map(|()| ()),
322 opt(parser:mll_quotes(ML_LITERAL_STRING_DELIM.void())),
323 )
324 .recognize()
325 .try_map(std::str::from_utf8)
326 .parse_next(input)
327}
328
329// mll-content = mll-char / newline
330fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
331 alt((one_of(MLL_CHAR), newline)).parse_next(input)
332}
333
334// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
335const MLL_CHAR: (
336 u8,
337 RangeInclusive<u8>,
338 RangeInclusive<u8>,
339 RangeInclusive<u8>,
340) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
341
342// mll-quotes = 1*2apostrophe
343fn mll_quotes<'i>(
344 mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
345) -> impl Parser<Input<'i>, &'i str, ContextError> {
346 move |input: &mut Input<'i>| {
347 let start: Checkpoint, …> = input.checkpoint();
348 let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(parser:b"''", ignored:peek(parser:term.by_ref()))
349 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
350 .parse_next(input);
351
352 match res {
353 Err(winnow::error::ErrMode::Backtrack(_)) => {
354 input.reset(&start);
355 terminatedMap, …>, …, …, …, …, …>(parser:b"'", ignored:peek(parser:term.by_ref()))
356 .map(|b: &[u8]| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
357 .parse_next(input)
358 }
359 res: Result<&str, ErrMode> => res,
360 }
361 }
362}
363
// Unit tests for the string parsers; every case goes through the top-level `string` entry point.
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    // Escapes (`\"`, `\t`, `\n`, `\uXXXX`, `\UXXXXXXXX`) inside a single-line basic string.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Leading-newline trimming, quotes next to the delimiter, and unterminated inputs.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let invalid_cases = [r#"""" """#, r#"""" \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash trims the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


 fox jumps over \
 the lazy dog.""""#,
            r#""""\
 The quick brown \
 fox jumps over \
 the lazy dog.\
 """"#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
 """"#,
            r#""""
\
 \
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: backslashes are not escapes.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multi-line literal strings: embedded apostrophes and first-newline trimming.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
 All other whitespace
 is preserved.
'''"#;
        // Skip the opening delimiter plus the newline that follows it (3 + 1 bytes).
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
480