// strings.rs source code [crates/toml_edit-0.22.8/src/parser/strings.rs]

1	use std::borrow::Cow;
2	use std::char;
3	use std::ops::RangeInclusive;
4
5	use winnow::combinator::alt;
6	use winnow::combinator::cut_err;
7	use winnow::combinator::delimited;
8	use winnow::combinator::empty;
9	use winnow::combinator::fail;
10	use winnow::combinator::opt;
11	use winnow::combinator::peek;
12	use winnow::combinator::preceded;
13	use winnow::combinator::repeat;
14	use winnow::combinator::terminated;
15	use winnow::combinator::trace;
16	use winnow::prelude::*;
17	use winnow::stream::Stream;
18	use winnow::token::any;
19	use winnow::token::none_of;
20	use winnow::token::one_of;
21	use winnow::token::take_while;
22
23	use crate::parser::error::CustomError;
24	use crate::parser::numbers::HEXDIG;
25	use crate::parser::prelude::*;
26	use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
27
28	// ;; String
29
30	// string = ml-basic-string / basic-string / ml-literal-string / literal-string
31	pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
32	traceimpl Parser, …>(
33	name:"string",
34	parser:alt((
35	ml_basic_string,
36	basic_string,
37	ml_literal_string,
38	literal_string.map(Cow::Borrowed),
39	)),
40	)
41	.parse_next(input)
42	}
43
44	// ;; Basic String
45
46	// basic-string = quotation-mark basic-char quotation-mark*
47	pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
48	traceimpl Parser, …>(name:"basic-string", \|input: &mut Input<'i>\| {
49	let _ = one_of(QUOTATION_MARK).parse_next(input)?;
50
51	let mut c: Cow<'_, str> = Cow::Borrowed("");
52	if let Some(ci: Cow<'_, str>) = opt(parser:basic_chars).parse_next(input)? {
53	c = ci;
54	}
55	while let Some(ci: Cow<'_, str>) = opt(parser:basic_chars).parse_next(input)? {
56	c.to_mut().push_str(&ci);
57	}
58
59	let _ = cut_errContext, …>, …, …, …, …>(parser:one_of(QUOTATION_MARK))
60	.context(StrContext::Label("basic string"))
61	.parse_next(input)?;
62
63	Ok(c)
64	})
65	.parse_next(input)
66	}
67
// quotation-mark = %x22            ; "
/// The byte that opens and closes a basic string.
pub(crate) const QUOTATION_MARK: u8 = b'"';
70
71	// basic-char = basic-unescaped / escaped
72	fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
73	altimpl Parser, …>((
74	// Deviate from the official grammar by batching the unescaped chars so we build a string a
75	// chunk at a time, rather than a `char` at a time.
76	take_whileTryMap, …>, …, …, …, …, …, …>(occurrences:`1`.., BASIC_UNESCAPED)
77	.try_map(std::str::from_utf8)
78	.map(Cow::Borrowed),
79	escaped.map(\|c: char\| Cow::Owned(String::from(c))),
80	))
81	.parse_next(input)
82	}
83
84	// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
85	pub(crate) const BASIC_UNESCAPED: (
86	(u8, u8),
87	u8,
88	RangeInclusive<u8>,
89	RangeInclusive<u8>,
90	RangeInclusive<u8>,
91	) = (WSCHAR, `0x21`, `0x23`..=`0x5B`, `0x5D`..=`0x7E`, NON_ASCII);
92
93	// escaped = escape escape-seq-char
94	fn escaped(input: &mut Input<'_>) -> PResult<char> {
95	preceded(ESCAPE, parser:escape_seq_char).parse_next(input)
96	}
97
// escape = %x5C                    ; \
/// The byte that introduces an escape sequence in basic strings.
pub(crate) const ESCAPE: u8 = b'\\';
100
101	// escape-seq-char = %x22 ; " quotation mark U+0022
102	// escape-seq-char =/ %x5C ; \ reverse solidus U+005C
103	// escape-seq-char =/ %x62 ; b backspace U+0008
104	// escape-seq-char =/ %x66 ; f form feed U+000C
105	// escape-seq-char =/ %x6E ; n line feed U+000A
106	// escape-seq-char =/ %x72 ; r carriage return U+000D
107	// escape-seq-char =/ %x74 ; t tab U+0009
108	// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
109	// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
110	fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
111	dispatch! {any;
112	b'b' => empty.value('`\u{8}`'),
113	b'f' => empty.value('`\u{c}`'),
114	b'n' => empty.value('`\n`'),
115	b'r' => empty.value('`\r`'),
116	b't' => empty.value('`\t`'),
117	b'u' => cut_err(hexescape::<`4`>).context(StrContext::Label("unicode 4-digit hex code")),
118	b'U' => cut_err(hexescape::<`8`>).context(StrContext::Label("unicode 8-digit hex code")),
119	b'`\\`' => empty.value('`\\`'),
120	b'"' => empty.value('"'),
121	_ => {
122	cut_err(fail::<_, char, _>)
123	.context(StrContext::Label("escape sequence"))
124	.context(StrContext::Expected(StrContextValue::CharLiteral('b')))
125	.context(StrContext::Expected(StrContextValue::CharLiteral('f')))
126	.context(StrContext::Expected(StrContextValue::CharLiteral('n')))
127	.context(StrContext::Expected(StrContextValue::CharLiteral('r')))
128	.context(StrContext::Expected(StrContextValue::CharLiteral('t')))
129	.context(StrContext::Expected(StrContextValue::CharLiteral('u')))
130	.context(StrContext::Expected(StrContextValue::CharLiteral('U')))
131	.context(StrContext::Expected(StrContextValue::CharLiteral('`\\`')))
132	.context(StrContext::Expected(StrContextValue::CharLiteral('"')))
133	}
134	}
135	.parse_next(input)
136	}
137
138	pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
139	take_whileTryMap, …, …, …, …, …>, …, …, …, …, …>, …, …, …, …, …, …>(`0`..=N, HEXDIG)
140	.verify(\|b: &[u8]\| b.len() == N)
141	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`is_ascii_digit` filters out on-ASCII") })
142	.verify_map(\|s: &str\| u32::from_str_radix(src:s, radix:`16`).ok())
143	.try_map(\|h: u32\| char::from_u32(h).ok_or(err:CustomError::OutOfRange))
144	.parse_next(input)
145	}
146
147	// ;; Multiline Basic String
148
149	// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
150	// ml-basic-string-delim
151	fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
152	traceimpl Parser, …>(
153	name:"ml-basic-string",
154	parser:delimited(
155	ML_BASIC_STRING_DELIM,
156	parser:preceded(opt(newline), cut_err(ml_basic_body)),
157	ignored2:cut_err(ML_BASIC_STRING_DELIM),
158	)
159	.context(StrContext::Label("multiline basic string")),
160	)
161	.parse_next(input)
162	}
163
// ml-basic-string-delim = 3quotation-mark
/// The three-quotation-mark delimiter of a multi-line basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
166
167	// ml-basic-body = mlb-content ( mlb-quotes 1mlb-content ) [ mlb-quotes ]*
168	fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
169	let mut c = Cow::Borrowed("");
170	if let Some(ci) = opt(mlb_content).parse_next(input)? {
171	c = ci;
172	}
173	while let Some(ci) = opt(mlb_content).parse_next(input)? {
174	c.to_mut().push_str(&ci);
175	}
176
177	while let Some(qi) = opt(mlb_quotes(none_of(b'`\"`').value(()))).parse_next(input)? {
178	if let Some(ci) = opt(mlb_content).parse_next(input)? {
179	c.to_mut().push_str(qi);
180	c.to_mut().push_str(&ci);
181	while let Some(ci) = opt(mlb_content).parse_next(input)? {
182	c.to_mut().push_str(&ci);
183	}
184	} else {
185	break;
186	}
187	}
188
189	if let Some(qi) = opt(mlb_quotes(ML_BASIC_STRING_DELIM.void())).parse_next(input)? {
190	c.to_mut().push_str(qi);
191	}
192
193	Ok(c)
194	}
195
196	// mlb-content = mlb-char / newline / mlb-escaped-nl
197	// mlb-char = mlb-unescaped / escaped
198	fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
199	altimpl Parser, …>((
200	// Deviate from the official grammar by batching the unescaped chars so we build a string a
201	// chunk at a time, rather than a `char` at a time.
202	take_whileTryMap, …>, …, …, …, …, …, …>(occurrences:`1`.., MLB_UNESCAPED)
203	.try_map(std::str::from_utf8)
204	.map(Cow::Borrowed),
205	// Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
206	mlb_escaped_nl.map(\|_\| Cow::Borrowed("")),
207	escaped.map(\|c: char\| Cow::Owned(String::from(c))),
208	newline.map(\|_\| Cow::Borrowed("`\n`")),
209	))
210	.parse_next(input)
211	}
212
213	// mlb-quotes = 12quotation-mark*
214	fn mlb_quotes<'i>(
215	mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
216	) -> impl Parser<Input<'i>, &'i str, ContextError> {
217	move \|input: &mut Input<'i>\| {
218	let start: Checkpoint, …> = input.checkpoint();
219	let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(parser:b"`\"\"`", ignored:peek(parser:term.by_ref()))
220	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
221	.parse_next(input);
222
223	match res {
224	Err(winnow::error::ErrMode::Backtrack(_)) => {
225	input.reset(&start);
226	terminatedMap, …>, …, …, …, …, …>(parser:b"`\"`", ignored:peek(parser:term.by_ref()))
227	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
228	.parse_next(input)
229	}
230	res: Result<&str, ErrMode> => res,
231	}
232	}
233	}
234
235	// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
236	pub(crate) const MLB_UNESCAPED: (
237	(u8, u8),
238	u8,
239	RangeInclusive<u8>,
240	RangeInclusive<u8>,
241	RangeInclusive<u8>,
242	) = (WSCHAR, `0x21`, `0x23`..=`0x5B`, `0x5D`..=`0x7E`, NON_ASCII);
243
244	// mlb-escaped-nl = escape ws newline ( wschar / newline*
245	// When the last non-whitespace character on a line is a \,
246	// it will be trimmed along with all whitespace
247	// (including newlines) up to the next non-whitespace
248	// character or closing delimiter.
249	fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
250	repeatValue …, …), …, …, …, …>, …, …, …, …, …>, …, …, …, …>(`1`.., (ESCAPE, ws, ws_newlines))
251	.map(\|()\| ())
252	.value(())
253	.parse_next(input)
254	}
255
256	// ;; Literal String
257
258	// literal-string = apostrophe literal-char apostrophe*
259	pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
260	traceimpl Parser, …>(
261	name:"literal-string",
262	parser:delimited(
263	APOSTROPHE,
264	parser:cut_err(take_while(`0`.., LITERAL_CHAR)),
265	ignored2:cut_err(APOSTROPHE),
266	)
267	.try_map(std::str::from_utf8)
268	.context(StrContext::Label("literal string")),
269	)
270	.parse_next(input)
271	}
272
// apostrophe = %x27 ; ' apostrophe
/// The byte that opens and closes a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
275
276	// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
277	pub(crate) const LITERAL_CHAR: (
278	u8,
279	RangeInclusive<u8>,
280	RangeInclusive<u8>,
281	RangeInclusive<u8>,
282	) = (`0x9`, `0x20`..=`0x26`, `0x28`..=`0x7E`, NON_ASCII);
283
284	// ;; Multiline Literal String
285
286	// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
287	// ml-literal-string-delim
288	fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
289	traceimpl Parser, …>(
290	name:"ml-literal-string",
291	parser:delimited(
292	(ML_LITERAL_STRING_DELIM, opt(newline)),
293	parser:cut_err(ml_literal_body.map(\|t\| {
294	if t.contains("`\r\n`") {
295	Cow::Owned(t.replace("`\r\n`", "`\n`"))
296	} else {
297	Cow::Borrowed(t)
298	}
299	})),
300	ignored2:cut_err(ML_LITERAL_STRING_DELIM),
301	)
302	.context(StrContext::Label("multiline literal string")),
303	)
304	.parse_next(input)
305	}
306
// ml-literal-string-delim = 3apostrophe
/// The three-apostrophe delimiter of a multi-line literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
309
310	// ml-literal-body = mll-content ( mll-quotes 1mll-content ) [ mll-quotes ]*
311	fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
312	(
313	repeat(occurrences:`0`.., parser:mll_content).map(\|()\| ()),
314	repeatRepeat<(impl Parser, …>, …), …, …, …, …>(
315	occurrences:`0`..,
316	(
317	mll_quotes(term:none_of(APOSTROPHE).value(())),
318	repeat(occurrences:`1`.., parser:mll_content).map(\|()\| ()),
319	),
320	)
321	.map(\|()\| ()),
322	opt(parser:mll_quotes(ML_LITERAL_STRING_DELIM.void())),
323	)
324	.recognize()
325	.try_map(std::str::from_utf8)
326	.parse_next(input)
327	}
328
329	// mll-content = mll-char / newline
330	fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
331	alt((one_of(MLL_CHAR), newline)).parse_next(input)
332	}
333
334	// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
335	const MLL_CHAR: (
336	u8,
337	RangeInclusive<u8>,
338	RangeInclusive<u8>,
339	RangeInclusive<u8>,
340	) = (`0x9`, `0x20`..=`0x26`, `0x28`..=`0x7E`, NON_ASCII);
341
342	// mll-quotes = 12apostrophe*
343	fn mll_quotes<'i>(
344	mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
345	) -> impl Parser<Input<'i>, &'i str, ContextError> {
346	move \|input: &mut Input<'i>\| {
347	let start: Checkpoint, …> = input.checkpoint();
348	let res: Result<&str, ErrMode> = terminatedMap, …>, …, …, …, …, …>(parser:b"''", ignored:peek(parser:term.by_ref()))
349	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
350	.parse_next(input);
351
352	match res {
353	Err(winnow::error::ErrMode::Backtrack(_)) => {
354	input.reset(&start);
355	terminatedMap, …>, …, …, …, …, …>(parser:b"'", ignored:peek(parser:term.by_ref()))
356	.map(\|b: &[u8]\| unsafe { from_utf8_unchecked(bytes:b, safety_justification:"`bytes` out non-ASCII") })
357	.parse_next(input)
358	}
359	res: Result<&str, ErrMode> => res,
360	}
361	}
362	}
363
// Unit tests for the string parsers; each one drives the top-level `string` parser.
//
// NOTE(review): the continuation lines of the raw-string fixtures below are written
// flush-left. Any leading whitespace they may once have had inside the raw strings could not
// be recovered — compare against the upstream file before relying on them for whitespace
// coverage. The assertions do not depend on it: expected values are either derived from the
// input by slicing or, for the line-ending backslash cases, independent of the whitespace
// that follows the backslash.
#[cfg(test)]
#[cfg(feature = "parse")]
#[cfg(feature = "display")]
mod test {
    use super::*;

    // A basic string with escaped quotes, `\t`, `\n`, and both `\u` and `\U` escapes.
    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    // Multi-line basic strings: first-newline trimming, quotes next to the delimiter, and
    // inputs that must be rejected.
    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let invalid_cases = [r#"""" """#, r#"""" \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    // A line-ending backslash removes the newline and the whitespace that follows it.
    #[test]
    fn ml_basic_string_escape_ws() {
        let inputs = [
            r#""""
The quick brown \


fox jumps over \
the lazy dog.""""#,
            r#""""\
The quick brown \
fox jumps over \
the lazy dog.\
""""#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
""""#,
            r#""""
\
\
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Literal strings are returned verbatim: the expected value is the input minus its
    // surrounding apostrophes.
    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c\s>'",
        ];

        for input in &inputs {
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    // Multi-line literal strings: the expected value is the input minus the delimiters and,
    // in the last case, minus the newline that directly follows the opening delimiter.
    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
All other whitespace
is preserved.
'''"#;
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
480