strings.rs source code [crates/toml_edit-0.21.1/src/parser/strings.rs]

1	use std::borrow::Cow;
2	use std::char;
3	use std::ops::RangeInclusive;
4
5	use winnow::combinator::alt;
6	use winnow::combinator::cut_err;
7	use winnow::combinator::delimited;
8	use winnow::combinator::fail;
9	use winnow::combinator::opt;
10	use winnow::combinator::peek;
11	use winnow::combinator::preceded;
12	use winnow::combinator::repeat;
13	use winnow::combinator::success;
14	use winnow::combinator::terminated;
15	use winnow::prelude::*;
16	use winnow::stream::Stream;
17	use winnow::token::any;
18	use winnow::token::none_of;
19	use winnow::token::one_of;
20	use winnow::token::tag;
21	use winnow::token::take_while;
22	use winnow::trace::trace;
23
24	use crate::parser::error::CustomError;
25	use crate::parser::numbers::HEXDIG;
26	use crate::parser::prelude::*;
27	use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
28
29	// ;; String
30
31	// string = ml-basic-string / basic-string / ml-literal-string / literal-string
32	pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
33	traceimpl Parser, …>(
34	name:"string",
35	parser:alt((
36	ml_basic_string,
37	basic_string,
38	ml_literal_string,
39	literal_string.map(Cow::Borrowed),
40	)),
41	)
42	.parse_next(input)
43	}
44
45	// ;; Basic String
46
47	// basic-string = quotation-mark basic-char quotation-mark*
48	pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
49	traceimpl Parser, …>(name:"basic-string", \|input: &mut Input<'i>\| {
50	let _ = one_of(QUOTATION_MARK).parse_next(input)?;
51
52	let mut c: Cow<'_, str> = Cow::Borrowed("");
53	if let Some(ci: Cow<'_, str>) = opt(basic_chars).parse_next(input)? {
54	c = ci;
55	}
56	while let Some(ci: Cow<'_, str>) = opt(basic_chars).parse_next(input)? {
57	c.to_mut().push_str(&ci);
58	}
59
60	let _ = cut_errContext, …>, …, …, …, …>(parser:one_of(QUOTATION_MARK))
61	.context(StrContext::Label("basic string"))
62	.parse_next(input)?;
63
64	Ok(c)
65	})
66	.parse_next(input)
67	}
68
// quotation-mark = %x22            ; "
/// The `"` byte; `basic_string` matches it as both the opening and the
/// closing delimiter.
pub(crate) const QUOTATION_MARK: u8 = b'"';
71
72	// basic-char = basic-unescaped / escaped
73	fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
74	altimpl Parser, …>((
75	// Deviate from the official grammar by batching the unescaped chars so we build a string a
76	// chunk at a time, rather than a `char` at a time.
77	take_whileTryMap, …>, …, …, …, …, …, …>(range:`1`.., BASIC_UNESCAPED)
78	.try_map(std::str::from_utf8)
79	.map(Cow::Borrowed),
80	escaped.map(\|c: char\| Cow::Owned(String::from(c))),
81	))
82	.parse_next(input)
83	}
84
85	// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
86	pub(crate) const BASIC_UNESCAPED: (
87	(u8, u8),
88	u8,
89	RangeInclusive<u8>,
90	RangeInclusive<u8>,
91	RangeInclusive<u8>,
92	) = (WSCHAR, `0x21`, `0x23`..=`0x5B`, `0x5D`..=`0x7E`, NON_ASCII);
93
94	// escaped = escape escape-seq-char
95	fn escaped(input: &mut Input<'_>) -> PResult<char> {
96	preceded(ESCAPE, second:escape_seq_char).parse_next(input)
97	}
98
// escape = %x5C                    ; \
/// The backslash byte that introduces an escape sequence (see `escaped`).
pub(crate) const ESCAPE: u8 = b'\\';
101
102	// escape-seq-char = %x22 ; " quotation mark U+0022
103	// escape-seq-char =/ %x5C ; \ reverse solidus U+005C
104	// escape-seq-char =/ %x62 ; b backspace U+0008
105	// escape-seq-char =/ %x66 ; f form feed U+000C
106	// escape-seq-char =/ %x6E ; n line feed U+000A
107	// escape-seq-char =/ %x72 ; r carriage return U+000D
108	// escape-seq-char =/ %x74 ; t tab U+0009
109	// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
110	// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
111	fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
112	dispatch! {any;
113	b'b' => success('`\u{8}`'),
114	b'f' => success('`\u{c}`'),
115	b'n' => success('`\n`'),
116	b'r' => success('`\r`'),
117	b't' => success('`\t`'),
118	b'u' => cut_err(hexescape::<`4`>).context(StrContext::Label("unicode 4-digit hex code")),
119	b'U' => cut_err(hexescape::<`8`>).context(StrContext::Label("unicode 8-digit hex code")),
120	b'`\\`' => success('`\\`'),
121	b'"' => success('"'),
122	_ => {
123	cut_err(fail::<_, char, _>)
124	.context(StrContext::Label("escape sequence"))
125	.context(StrContext::Expected(StrContextValue::CharLiteral('b')))
126	.context(StrContext::Expected(StrContextValue::CharLiteral('f')))
127	.context(StrContext::Expected(StrContextValue::CharLiteral('n')))
128	.context(StrContext::Expected(StrContextValue::CharLiteral('r')))
129	.context(StrContext::Expected(StrContextValue::CharLiteral('t')))
130	.context(StrContext::Expected(StrContextValue::CharLiteral('u')))
131	.context(StrContext::Expected(StrContextValue::CharLiteral('U')))
132	.context(StrContext::Expected(StrContextValue::CharLiteral('`\\`')))
133	.context(StrContext::Expected(StrContextValue::CharLiteral('"')))
134	}
135	}
136	.parse_next(input)
137	}
138
139	pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
140	take_whileTryMap, …, …, …, …, …>, …, …, …, …, …>, …, …, …, …, …, …>(`0`..=N, HEXDIG)
141	.verify(\|b: &[u8]\| b.len() == N)
142	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`is_ascii_digit` filters out on-ASCII") })
143	.verify_map(\|s: &str\| u32::from_str_radix(src:s, radix:`16`).ok())
144	.try_map(\|h: u32\| char::from_u32(h).ok_or(err:CustomError::OutOfRange))
145	.parse_next(input)
146	}
147
148	// ;; Multiline Basic String
149
150	// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
151	// ml-basic-string-delim
152	fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
153	traceimpl Parser, …>(
154	name:"ml-basic-string",
155	parser:delimited(
156	ML_BASIC_STRING_DELIM,
157	second:preceded(opt(newline), cut_err(ml_basic_body)),
158	third:cut_err(ML_BASIC_STRING_DELIM),
159	)
160	.context(StrContext::Label("multiline basic string")),
161	)
162	.parse_next(input)
163	}
164
// ml-basic-string-delim = 3quotation-mark
/// Three `"` bytes: the opening and closing delimiter of a multi-line basic
/// string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
167
168	// ml-basic-body = mlb-content ( mlb-quotes 1mlb-content ) [ mlb-quotes ]*
169	fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
170	let mut c = Cow::Borrowed("");
171	if let Some(ci) = opt(mlb_content).parse_next(input)? {
172	c = ci;
173	}
174	while let Some(ci) = opt(mlb_content).parse_next(input)? {
175	c.to_mut().push_str(&ci);
176	}
177
178	while let Some(qi) = opt(mlb_quotes(none_of(b'`\"`').value(()))).parse_next(input)? {
179	if let Some(ci) = opt(mlb_content).parse_next(input)? {
180	c.to_mut().push_str(qi);
181	c.to_mut().push_str(&ci);
182	while let Some(ci) = opt(mlb_content).parse_next(input)? {
183	c.to_mut().push_str(&ci);
184	}
185	} else {
186	break;
187	}
188	}
189
190	if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? {
191	c.to_mut().push_str(qi);
192	}
193
194	Ok(c)
195	}
196
197	// mlb-content = mlb-char / newline / mlb-escaped-nl
198	// mlb-char = mlb-unescaped / escaped
199	fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
200	altimpl Parser, …>((
201	// Deviate from the official grammar by batching the unescaped chars so we build a string a
202	// chunk at a time, rather than a `char` at a time.
203	take_whileTryMap, …>, …, …, …, …, …, …>(range:`1`.., MLB_UNESCAPED)
204	.try_map(std::str::from_utf8)
205	.map(Cow::Borrowed),
206	// Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
207	mlb_escaped_nl.map(\|_\| Cow::Borrowed("")),
208	escaped.map(\|c: char\| Cow::Owned(String::from(c))),
209	newline.map(\|_\| Cow::Borrowed("`\n`")),
210	))
211	.parse_next(input)
212	}
213
214	// mlb-quotes = 12quotation-mark*
215	fn mlb_quotes<'i>(
216	mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
217	) -> impl Parser<Input<'i>, &'i str, ContextError> {
218	move \|input: &mut Input<'i>\| {
219	let start: Checkpoint> = input.checkpoint();
220	let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(first:b"`\"\"`", second:peek(term.by_ref()))
221	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
222	.parse_next(input);
223
224	match res {
225	Err(winnow::error::ErrMode::Backtrack(_)) => {
226	input.reset(checkpoint:start);
227	terminatedMap, …>, …, …, …, …, …>(first:b"`\"`", second:peek(term.by_ref()))
228	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
229	.parse_next(input)
230	}
231	res: Result<&str, ErrMode> => res,
232	}
233	}
234	}
235
236	// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
237	pub(crate) const MLB_UNESCAPED: (
238	(u8, u8),
239	u8,
240	RangeInclusive<u8>,
241	RangeInclusive<u8>,
242	RangeInclusive<u8>,
243	) = (WSCHAR, `0x21`, `0x23`..=`0x5B`, `0x5D`..=`0x7E`, NON_ASCII);
244
245	// mlb-escaped-nl = escape ws newline ( wschar / newline*
246	// When the last non-whitespace character on a line is a \,
247	// it will be trimmed along with all whitespace
248	// (including newlines) up to the next non-whitespace
249	// character or closing delimiter.
250	fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
251	repeatValue …, …), …, …, …, …>, …, …, …, …, …>, …, …, …, …>(`1`.., (ESCAPE, ws, ws_newlines))
252	.map(\|()\| ())
253	.value(())
254	.parse_next(input)
255	}
256
257	// ;; Literal String
258
259	// literal-string = apostrophe literal-char apostrophe*
260	pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
261	traceimpl Parser, …>(
262	name:"literal-string",
263	parser:delimited(
264	APOSTROPHE,
265	second:cut_err(take_while(`0`.., LITERAL_CHAR)),
266	third:cut_err(APOSTROPHE),
267	)
268	.try_map(std::str::from_utf8)
269	.context(StrContext::Label("literal string")),
270	)
271	.parse_next(input)
272	}
273
// apostrophe = %x27 ; ' apostrophe
/// The `'` byte that delimits a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
276
277	// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
278	pub(crate) const LITERAL_CHAR: (
279	u8,
280	RangeInclusive<u8>,
281	RangeInclusive<u8>,
282	RangeInclusive<u8>,
283	) = (`0x9`, `0x20`..=`0x26`, `0x28`..=`0x7E`, NON_ASCII);
284
285	// ;; Multiline Literal String
286
287	// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
288	// ml-literal-string-delim
289	fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
290	traceimpl Parser, …>(
291	name:"ml-literal-string",
292	parser:delimited(
293	(ML_LITERAL_STRING_DELIM, opt(newline)),
294	second:cut_err(ml_literal_body.map(\|t\| {
295	if t.contains("`\r\n`") {
296	Cow::Owned(t.replace("`\r\n`", "`\n`"))
297	} else {
298	Cow::Borrowed(t)
299	}
300	})),
301	third:cut_err(ML_LITERAL_STRING_DELIM),
302	)
303	.context(StrContext::Label("multiline literal string")),
304	)
305	.parse_next(input)
306	}
307
// ml-literal-string-delim = 3apostrophe
/// Three `'` bytes: the opening and closing delimiter of a multi-line literal
/// string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
310
311	// ml-literal-body = mll-content ( mll-quotes 1mll-content ) [ mll-quotes ]*
312	fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
313	(
314	repeat(range:`0`.., parser:mll_content).map(\|()\| ()),
315	repeatRepeat<(impl Parser, …>, …), …, …, …, …>(
316	range:`0`..,
317	(
318	mll_quotes(term:none_of(APOSTROPHE).value(())),
319	repeat(range:`1`.., parser:mll_content).map(\|()\| ()),
320	),
321	)
322	.map(\|()\| ()),
323	opt(mll_quotes(term:tag(ML_LITERAL_STRING_DELIM).value(()))),
324	)
325	.recognize()
326	.try_map(std::str::from_utf8)
327	.parse_next(input)
328	}
329
330	// mll-content = mll-char / newline
331	fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
332	alt((one_of(MLL_CHAR), newline)).parse_next(input)
333	}
334
335	// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
336	const MLL_CHAR: (
337	u8,
338	RangeInclusive<u8>,
339	RangeInclusive<u8>,
340	RangeInclusive<u8>,
341	) = (`0x9`, `0x20`..=`0x26`, `0x28`..=`0x7E`, NON_ASCII);
342
343	// mll-quotes = 12apostrophe*
344	fn mll_quotes<'i>(
345	mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
346	) -> impl Parser<Input<'i>, &'i str, ContextError> {
347	move \|input: &mut Input<'i>\| {
348	let start: Checkpoint> = input.checkpoint();
349	let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(first:b"''", second:peek(term.by_ref()))
350	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
351	.parse_next(input);
352
353	match res {
354	Err(winnow::error::ErrMode::Backtrack(_)) => {
355	input.reset(checkpoint:start);
356	terminatedMap, …>, …, …, …, …, …>(first:b"'", second:peek(term.by_ref()))
357	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
358	.parse_next(input)
359	}
360	res: Result<&str, ErrMode> => res,
361	}
362	}
363	}
364
// Unit tests for the string parsers; each case is driven through the
// top-level `string` parser.
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    // Basic string with quote, tab, newline and both Unicode escape widths.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Multi-line basic strings: leading-newline trimming, embedded quotes, and
    // inputs whose closing delimiter is incomplete.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let invalid_cases = [r#""""  """#, r#""""  \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash trims the newline and all following whitespace.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim, minus the surrounding apostrophes.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multi-line literal strings: embedded apostrophes and leading-newline trimming.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip the opening delimiter plus the trimmed first newline (4 bytes).
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
481