bytes.rs source code [crates/shlex/src/bytes.rs]

1	// Copyright 2015 Nicholas Allegra (comex).
2	// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
3	// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
4	// copied, modified, or distributed except according to those terms.
5
//! [`Shlex`] and friends for byte strings.
//!
//! This is used internally by the [outer module](crate), and may be more
//! convenient if you are working with byte slices (`[u8]`) or types that are
//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
//!
//! ```rust
//! #[cfg(unix)] {
//! use shlex::bytes::quote;
//! use std::ffi::OsStr;
//! use std::os::unix::ffi::OsStrExt;
//!
//! // `\x80` is invalid in UTF-8.
//! let os_str = OsStr::from_bytes(b"a\x80b c");
//! assert_eq!(quote(os_str.as_bytes()), &b"'a\x80b c'"[..]);
//! }
//! ```
//!
//! (On Windows, `OsStr` uses 16-bit wide characters, so this will not work.)
25
26	extern crate alloc;
27	use alloc::vec::Vec;
28	use alloc::borrow::Cow;
29	#[cfg(test)]
30	use alloc::vec;
31	#[cfg(test)]
32	use alloc::borrow::ToOwned;
33	#[cfg(all(doc, not(doctest)))]
34	use crate::{self as shlex, quoting_warning};
35
36	use super::QuoteError;
37
/// An iterator that takes an input byte string and splits it into the words using the same syntax as
/// the POSIX shell.
pub struct Shlex<'a> {
    in_iter: core::slice::Iter<'a, u8>,
    /// The number of newlines read so far, plus one.
    pub line_no: usize,
    /// An input string is erroneous if it ends while inside a quotation or right after an
    /// unescaped backslash.  Since Iterator does not have a mechanism to return an error, if that
    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
    /// true; best to check it after you're done iterating.
    pub had_error: bool,
}

impl<'a> Shlex<'a> {
    /// Creates a splitter over `in_bytes`, starting at line 1 with no error recorded.
    pub fn new(in_bytes: &'a [u8]) -> Self {
        Shlex {
            in_iter: in_bytes.iter(),
            line_no: 1,
            had_error: false,
        }
    }

    /// Parses one word whose first byte is `ch`.
    ///
    /// Returns `None` and sets `had_error` if the input ends inside a quotation or right after
    /// a backslash.
    fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
        let mut result: Vec<u8> = Vec::new();
        loop {
            match ch {
                b'"' => {
                    if self.parse_double(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\'' => {
                    if self.parse_single(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\\' => match self.next_char() {
                    // \<newline> => nothing (line continuation)
                    Some(b'\n') => {}
                    // \x => x
                    Some(ch2) => result.push(ch2),
                    None => {
                        self.had_error = true;
                        return None;
                    }
                },
                // Unquoted whitespace terminates the word.
                b' ' | b'\t' | b'\n' => break,
                _ => result.push(ch),
            }
            match self.next_char() {
                Some(ch2) => ch = ch2,
                None => break,
            }
        }
        Some(result)
    }

    /// Consumes the body of a double-quoted section (the opening quote has already been read),
    /// appending the unescaped bytes to `result`.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char().ok_or(())? {
                b'\\' => {
                    let ch3 = self.next_char().ok_or(())?;
                    match ch3 {
                        // \$ => $
                        b'$' | b'`' | b'"' | b'\\' => result.push(ch3),
                        // \<newline> => nothing
                        b'\n' => {}
                        // \x => \x (the backslash is kept)
                        _ => {
                            result.push(b'\\');
                            result.push(ch3);
                        }
                    }
                }
                b'"' => return Ok(()),
                ch2 => result.push(ch2),
            }
        }
    }

    /// Consumes the body of a single-quoted section (the opening quote has already been read),
    /// appending every byte verbatim to `result`.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char().ok_or(())? {
                b'\'' => return Ok(()),
                ch2 => result.push(ch2),
            }
        }
    }

    /// Reads the next input byte, bumping `line_no` whenever a newline is consumed.
    fn next_char(&mut self) -> Option<u8> {
        let res = self.in_iter.next().copied();
        if res == Some(b'\n') {
            self.line_no += 1;
        }
        res
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = Vec<u8>;

    /// Yields the next word, or `None` at end of input or on error (see `had_error`).
    fn next(&mut self) -> Option<Self::Item> {
        // No initial character means there is nothing left to yield.
        let mut ch = self.next_char()?;
        // Skip initial whitespace and `#` comments.
        loop {
            match ch {
                b' ' | b'\t' | b'\n' => {}
                b'#' => {
                    // A comment runs up to and including the next newline.
                    while let Some(ch2) = self.next_char() {
                        if ch2 == b'\n' {
                            break;
                        }
                    }
                }
                _ => break,
            }
            ch = self.next_char()?;
        }
        self.parse_word(ch)
    }
}
157
158	/// Convenience function that consumes the whole byte string at once. Returns None if the input was
159	/// erroneous.
160	pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
161	let mut shl: Shlex<'_> = Shlex::new(in_bytes);
162	let res: Vec> = shl.by_ref().collect();
163	if shl.had_error { None } else { Some(res) }
164	}
165
166	/// A more configurable interface to quote strings. If you only want the default settings you can
167	/// use the convenience functions [`try_quote`] and [`try_join`].
168	///
169	/// The string equivalent is [`shlex::Quoter`].
170	#[derive(Default, Debug, Clone)]
171	pub struct Quoter {
172	allow_nul: bool,
173	// TODO: more options
174	}
175
176	impl Quoter {
177	/// Create a new [`Quoter`] with default settings.
178	#[inline]
179	pub fn new() -> Self {
180	Self::default()
181	}
182
183	/// Set whether to allow [nul bytes](quoting_warning#nul-bytes). By default they are not
184	/// allowed and will result in an error of [`QuoteError::Nul`].
185	#[inline]
186	pub fn allow_nul(mut self, allow: bool) -> Self {
187	self.allow_nul = allow;
188	self
189	}
190
191	/// Convenience function that consumes an iterable of words and turns it into a single byte string,
192	/// quoting words when necessary. Consecutive words will be separated by a single space.
193	pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(&self, words: I) -> Result<Vec<u8>, QuoteError> {
194	Ok(words.into_iter()
195	.map(\|word\| self.quote(word))
196	.collect::<Result<Vec<Cow<[u8]>>, QuoteError>>()?
197	.join(&b' '))
198	}
199
200	/// Given a single word, return a byte string suitable to encode it as a shell argument.
201	///
202	/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
203	/// ever inserts valid ASCII characters before or after existing ASCII characters (or
204	/// returns two single quotes if the input was an empty string). It will never modify a
205	/// multibyte UTF-8 character.
206	pub fn quote<'a>(&self, mut in_bytes: &'a [u8]) -> Result<Cow<'a, [u8]>, QuoteError> {
207	if in_bytes.is_empty() {
208	// Empty string. Special case that isn't meaningful as only part of a word.
209	return Ok(b"''"[..].into());
210	}
211	if !self.allow_nul && in_bytes.iter().any(\|&b\| b == b'`\0`') {
212	return Err(QuoteError::Nul);
213	}
214	let mut out: Vec<u8> = Vec::new();
215	while !in_bytes.is_empty() {
216	// Pick a quoting strategy for some prefix of the input. Normally this will cover the
217	// entire input, but in some case we might need to divide the input into multiple chunks
218	// that are quoted differently.
219	let (cur_len, strategy) = quoting_strategy(in_bytes);
220	if cur_len == in_bytes.len() && strategy == QuotingStrategy::Unquoted && out.is_empty() {
221	// Entire string can be represented unquoted. Reuse the allocation.
222	return Ok(in_bytes.into());
223	}
224	let (cur_chunk, rest) = in_bytes.split_at(cur_len);
225	assert!(rest.len() < in_bytes.len()); // no infinite loop
226	in_bytes = rest;
227	append_quoted_chunk(&mut out, cur_chunk, strategy);
228	}
229	Ok(out.into())
230	}
231
232	}
233
/// The ways a chunk of a word can be emitted; chosen per chunk by `quoting_strategy`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuotingStrategy {
    /// No quotes and no backslash escapes.  (If backslash escapes would be necessary, we use a
    /// different strategy instead.)
    Unquoted,
    /// Single quoted.
    SingleQuoted,
    /// Double quotes, potentially with backslash escapes.
    DoubleQuoted,
    // TODO: add $'xxx' and "$(printf 'xxx')" styles
}
245
/// Is this ASCII byte okay to emit unquoted?
///
/// Only meaningful for `c < 0x80`; non-ASCII bytes must be handled by the caller.
const fn unquoted_ok(c: u8) -> bool {
    match c as char {
        // Allowed characters:
        '+' | '-' | '.' | '/' | ':' | '@' | ']' | '_' |
        '0'..='9' | 'A'..='Z' | 'a'..='z'
        => true,

        // Non-allowed characters:
        // From POSIX https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
        // "The application shall quote the following characters if they are to represent themselves:"
        '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | '\n' |
        // "and the following may need to be quoted under certain circumstances[..]:"
        '*' | '?' | '[' | '#' | '~' | '=' | '%' |
        // Brace expansion.  These ought to be in the POSIX list but aren't yet;
        // see: https://www.austingroupbugs.net/view.php?id=1193
        '{' | '}' |
        // Also quote comma, just to be safe in the extremely odd case that the user of this crate
        // is intentionally placing a quoted string inside a brace expansion, e.g.:
        //     format!("echo foo{{a,b,{}}}", shlex::quote(some_str))
        ',' |
        // '\r' is allowed in a word by all real shells I tested, but is treated as a word
        // separator by Python `shlex`, and might be translated to '\n' in interactive mode.
        '\r' |
        // '!' and '^' are treated specially in interactive mode; see quoting_warning.
        '!' | '^' |
        // Nul bytes and control characters.
        '\x00' ..= '\x1f' | '\x7f'
        => false,
        '\u{80}' ..= '\u{10ffff}' => {
            // This is unreachable since `unquoted_ok` is only called for 0..128.
            // Non-ASCII bytes are handled separately in `quoting_strategy`.
            // Can't call unreachable!() from `const fn` on old Rust, so...
            unquoted_ok(c)
        },
    }
    // Note: The logic cited above for quoting comma might suggest that `..` should also be quoted
    // (it is a special case of brace expansion).  But it's not necessary.  There are three cases:
    //
    // 1. The user wants comma-based brace expansion, but the untrusted string being `quote`d
    //    contains `..`, so they get something like `{foo,bar,3..5}`.
    //  => That's safe; both Bash and Zsh expand this to `foo bar 3..5` rather than
    //     `foo bar 3 4 5`.  The presence of commas disables sequence expression expansion.
    //
    // 2. The user wants comma-based brace expansion where the contents of the braces are a
    //    variable number of `quote`d strings and nothing else.  There happens to be exactly
    //    one string and it contains `..`, so they get something like `{3..5}`.
    //  => Then this will expand as a sequence expression, which is unintended.  But I don't mind,
    //     because any such code is already buggy.  Suppose the untrusted string *didn't* contain
    //     `,` or `..`, resulting in shell input like `{foo}`.  Then the shell would interpret it
    //     as the literal string `{foo}` rather than brace-expanding it into `foo`.
    //
    // 3. The user wants a sequence expression and wants to supply an untrusted string as one of
    //    the endpoints or the increment.
    //  => Well, that's just silly, since the endpoints can only be numbers or single letters.
}
302
303	/// Optimized version of `unquoted_ok`.
304	fn unquoted_ok_fast(c: u8) -> bool {
305	const UNQUOTED_OK_MASK: u128 = {
306	// Make a mask of all bytes in 0..<0x80 that pass.
307	let mut c: u8 = `0u8`;
308	let mut mask: u128 = `0u128`;
309	while c < `0x80` {
310	if unquoted_ok(c) {
311	mask \|= `1u128` << c;
312	}
313	c += `1`;
314	}
315	mask
316	};
317	((UNQUOTED_OK_MASK >> c) & `1`) != `0`
318	}
319
/// Is this ASCII byte okay to emit in single quotes?
fn single_quoted_ok(c: u8) -> bool {
    match c {
        // No single quotes in single quotes.
        b'\'' => false,
        // To work around a Bash bug, ^ is only allowed right after an opening single quote; see
        // quoting_warning.
        b'^' => false,
        // Backslashes in single quotes are literal according to POSIX, but Fish treats them as an
        // escape character.  Ban them.  Fish doesn't aim to be POSIX-compatible, but we can
        // achieve Fish compatibility using double quotes, so we might as well.
        b'\\' => false,
        _ => true,
    }
}
335
/// Is this ASCII byte okay to emit in double quotes?
fn double_quoted_ok(c: u8) -> bool {
    match c {
        // Work around Python `shlex` bug where parsing "\`" and "\$" doesn't strip the
        // backslash, even though POSIX requires it.
        b'`' | b'$' => false,
        // '!' and '^' are treated specially in interactive mode; see quoting_warning.
        b'!' | b'^' => false,
        _ => true,
    }
}
347
348	/// Given an input, return a quoting strategy that can cover some prefix of the string, along with
349	/// the size of that prefix.
350	///
351	/// Precondition: input size is nonzero. (Empty strings are handled by the caller.)
352	/// Postcondition: returned size is nonzero.
353	#[cfg_attr(manual_codegen_check, inline(never))]
354	fn quoting_strategy(in_bytes: &[u8]) -> (usize, QuotingStrategy) {
355	const UNQUOTED_OK: u8 = `1`;
356	const SINGLE_QUOTED_OK: u8 = `2`;
357	const DOUBLE_QUOTED_OK: u8 = `4`;
358
359	let mut prev_ok = SINGLE_QUOTED_OK \| DOUBLE_QUOTED_OK \| UNQUOTED_OK;
360	let mut i = `0`;
361
362	if in_bytes[`0`] == b'^' {
363	// To work around a Bash bug, ^ is only allowed right after an opening single quote; see
364	// quoting_warning.
365	prev_ok = SINGLE_QUOTED_OK;
366	i = `1`;
367	}
368
369	while i < in_bytes.len() {
370	let c = in_bytes[i];
371	let mut cur_ok = prev_ok;
372
373	if c >= `0x80` {
374	// Normally, non-ASCII characters shouldn't require quoting, but see quoting_warning.md
375	// about \xa0. For now, just treat all non-ASCII characters as requiring quotes. This
376	// also ensures things are safe in the off-chance that you're in a legacy 8-bit locale that
377	// has additional characters satisfying `isblank`.
378	cur_ok &= !UNQUOTED_OK;
379	} else {
380	if !unquoted_ok_fast(c) {
381	cur_ok &= !UNQUOTED_OK;
382	}
383	if !single_quoted_ok(c){
384	cur_ok &= !SINGLE_QUOTED_OK;
385	}
386	if !double_quoted_ok(c) {
387	cur_ok &= !DOUBLE_QUOTED_OK;
388	}
389	}
390
391	if cur_ok == `0` {
392	// There are no quoting strategies that would work for both the previous characters and
393	// this one. So we have to end the chunk before this character. The caller will call
394	// `quoting_strategy` again to handle the rest of the string.
395	break;
396	}
397
398	prev_ok = cur_ok;
399	i += `1`;
400	}
401
402	// Pick the best allowed strategy.
403	let strategy = if prev_ok & UNQUOTED_OK != `0` {
404	QuotingStrategy::Unquoted
405	} else if prev_ok & SINGLE_QUOTED_OK != `0` {
406	QuotingStrategy::SingleQuoted
407	} else if prev_ok & DOUBLE_QUOTED_OK != `0` {
408	QuotingStrategy::DoubleQuoted
409	} else {
410	unreachable!()
411	};
412	debug_assert!(i > `0`);
413	(i, strategy)
414	}
415
416	fn append_quoted_chunk(out: &mut Vec<u8>, cur_chunk: &[u8], strategy: QuotingStrategy) {
417	match strategy {
418	QuotingStrategy::Unquoted => {
419	out.extend_from_slice(cur_chunk);
420	},
421	QuotingStrategy::SingleQuoted => {
422	out.reserve(cur_chunk.len() + `2`);
423	out.push(b'`\'`');
424	out.extend_from_slice(cur_chunk);
425	out.push(b'`\'`');
426	},
427	QuotingStrategy::DoubleQuoted => {
428	out.reserve(cur_chunk.len() + `2`);
429	out.push(b'"');
430	for &c in cur_chunk.into_iter() {
431	if let b'$' \| b'`' \| b'"' \| b'`\\`' = c {
432	// Add a preceding backslash.
433	// Note: We shouldn't actually get here for $ and ` because they don't pass
434	// `double_quoted_ok`.
435	out.push(b'`\\`');
436	}
437	// Add the character itself.
438	out.push(c);
439	}
440	out.push(b'"');
441	},
442	}
443	}
444
445	/// Convenience function that consumes an iterable of words and turns it into a single byte string,
446	/// quoting words when necessary. Consecutive words will be separated by a single space.
447	///
448	/// Uses default settings except that nul bytes are passed through, which [may be
449	/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
450	///
451	/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter).
452	///
453	/// (That configuration never returns `Err`, so this function does not panic.)
454	///
455	/// The string equivalent is [shlex::join].
456	#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")]
457	pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> {
458	Quoter::new().allow_nul(allow:`true`).join(words).unwrap()
459	}
460
461	/// Convenience function that consumes an iterable of words and turns it into a single byte string,
462	/// quoting words when necessary. Consecutive words will be separated by a single space.
463	///
464	/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
465	///
466	/// Equivalent to [`Quoter::new().join(words)`](Quoter).
467	///
468	/// The string equivalent is [shlex::try_join].
469	pub fn try_join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Result<Vec<u8>, QuoteError> {
470	Quoter::new().join(words)
471	}
472
473	/// Given a single word, return a string suitable to encode it as a shell argument.
474	///
475	/// Uses default settings except that nul bytes are passed through, which [may be
476	/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
477	///
478	/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_bytes).unwrap()`](Quoter).
479	///
480	/// (That configuration never returns `Err`, so this function does not panic.)
481	///
482	/// The string equivalent is [shlex::quote].
483	#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")]
484	pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
485	Quoter::new().allow_nul(allow:`true`).quote(in_bytes).unwrap()
486	}
487
488	/// Given a single word, return a string suitable to encode it as a shell argument.
489	///
490	/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
491	///
492	/// Equivalent to [`Quoter::new().quote(in_bytes)`](Quoter).
493	///
494	/// (That configuration never returns `Err`, so this function does not panic.)
495	///
496	/// The string equivalent is [shlex::try_quote].
497	pub fn try_quote(in_bytes: &[u8]) -> Result<Cow<[u8]>, QuoteError> {
498	Quoter::new().quote(in_bytes)
499	}
500
// A lone continuation byte: not valid UTF-8 on its own.
#[cfg(test)]
const INVALID_UTF8: &[u8] = b"\xa1";
// The same byte wrapped in single quotes.
#[cfg(test)]
const INVALID_UTF8_SINGLEQUOTED: &[u8] = b"'\xa1'";
505
#[test]
// The lint fires precisely because the constant is known to be invalid, which is the point.
#[allow(invalid_from_utf8)]
fn test_invalid_utf8() {
    // Check that our test string is actually invalid UTF-8.
    assert!(core::str::from_utf8(INVALID_UTF8).is_err());
}
512
// Pairs of (input, expected words); `None` means the input must be rejected as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &[(&[u8], Option<&[&[u8]]>)] = &[
    (b"foo$baz", Some(&[b"foo$baz"])),
    (b"foo baz", Some(&[b"foo", b"baz"])),
    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
    (b" foo \nbar", Some(&[b"foo", b"bar"])),
    (b"foo\\\nbar", Some(&[b"foobar"])),
    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
    (b"'baz\\\''", None),
    (b"\\", None),
    (b"\"\\", None),
    (b"'\\", None),
    (b"\"", None),
    (b"'", None),
    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
    (b"foo #bar", Some(&[b"foo"])),
    (b"foo#bar", Some(&[b"foo#bar"])),
    (b"foo\"#bar", None),
    (b"'\\n'", Some(&[b"\\n"])),
    (b"'\\\\n'", Some(&[b"\\\\n"])),
    (INVALID_UTF8, Some(&[INVALID_UTF8])),
];
537
#[test]
fn test_split() {
    // Every table entry must split to exactly the expected words (or be rejected).
    for &(input, output) in SPLIT_TEST_ITEMS {
        let expected: Option<Vec<Vec<u8>>> =
            output.map(|o| o.iter().map(|&x| x.to_owned()).collect());
        assert_eq!(split(input), expected, "input: {:?}", input);
    }
}
544
#[test]
fn test_lineno() {
    let mut sh = Shlex::new(b"\nfoo\nbar");
    let mut saw_bar = false;
    // `while let` rather than `for`: `sh.line_no` must stay readable inside the loop.
    while let Some(word) = sh.next() {
        if word == b"bar" {
            // Two newlines precede `bar`, so it sits on line 3.
            assert_eq!(sh.line_no, 3);
            saw_bar = true;
        }
    }
    // Guard against the check above being skipped entirely.
    assert!(saw_bar);
}
554
#[test]
#[allow(deprecated)]
fn test_quote() {
    // Validate behavior with invalid UTF-8:
    assert_eq!(quote(INVALID_UTF8), INVALID_UTF8_SINGLEQUOTED);
    // Replicate a few tests from lib.rs.  No need to replicate all of them.
    assert_eq!(quote(b""), &b"''"[..]);
    assert_eq!(quote(b"foobar"), &b"foobar"[..]);
    assert_eq!(quote(b"foo bar"), &b"'foo bar'"[..]);
    assert_eq!(quote(b"'\""), &b"\"'\\\"\""[..]);
}
567
#[test]
#[allow(deprecated)]
fn test_join() {
    // Validate behavior with invalid UTF-8:
    assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8_SINGLEQUOTED);
    // Replicate a few tests from lib.rs.  No need to replicate all of them.
    assert_eq!(join(vec![]), &b""[..]);
    assert_eq!(join(vec![&b""[..]]), b"''");
}
577