1// Copyright 2015 Nicholas Allegra (comex).
2// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
3// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
4// copied, modified, or distributed except according to those terms.
6//! [`Shlex`] and friends for byte strings.
8//! This is used internally by the [outer module](crate), and may be more
9//! convenient if you are working with byte slices (`[u8]`) or types that are
10//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
12//! ```rust
13//! #[cfg(unix)] {
14//! use shlex::bytes::quote;
15//! use std::ffi::OsStr;
16//! use std::os::unix::ffi::OsStrExt;
18//! // `\x80` is invalid in UTF-8.
19//! let os_str = OsStr::from_bytes(b"a\x80b c");
20//! assert_eq!(quote(os_str.as_bytes()), &b"'a\x80b c'"[..]);
21//! }
22//! ```
24//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.)
26extern crate alloc;
27use alloc::vec::Vec;
28use alloc::borrow::Cow;
30use alloc::vec;
32use alloc::borrow::ToOwned;
33#[cfg(all(doc, not(doctest)))]
34use crate::{self as shlex, quoting_warning};
36use super::QuoteError;
/// An iterator that takes an input byte string and splits it into the words using the same syntax as
/// the POSIX shell.
///
/// Each item is one word with quotes and backslash escapes already removed. Words are separated
/// by spaces, tabs and newlines; a `#` at the start of a word begins a comment that runs to the
/// end of the line.
pub struct Shlex<'a> {
    /// The input bytes that have not been consumed yet.
    in_iter: core::slice::Iter<'a, u8>,
    /// The number of newlines read so far, plus one.
    pub line_no: usize,
    /// An input string is erroneous if it ends while inside a quotation or right after an
    /// unescaped backslash.  Since Iterator does not have a mechanism to return an error, if that
    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
    /// true; best to check it after you're done iterating.
    pub had_error: bool,
}

impl<'a> Shlex<'a> {
    /// Creates a splitter over `in_bytes`, positioned on line 1 with no error recorded.
    pub fn new(in_bytes: &'a [u8]) -> Self {
        Shlex {
            in_iter: in_bytes.iter(),
            line_no: 1,
            had_error: false,
        }
    }

    /// Parses one word whose first byte `ch` has already been read.
    ///
    /// Returns `None` and sets `had_error` if the input ends inside a quotation or right after a
    /// backslash; otherwise returns the unquoted word.
    fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
        let mut result: Vec<u8> = Vec::new();
        loop {
            match ch {
                b'"' => {
                    if self.parse_double(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\'' => {
                    if self.parse_single(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\\' => match self.next_char() {
                    // \<newline> is a line continuation and contributes nothing.
                    Some(b'\n') => {}
                    Some(escaped) => result.push(escaped),
                    None => {
                        self.had_error = true;
                        return None;
                    }
                },
                // Unquoted whitespace terminates the word.
                b' ' | b'\t' | b'\n' => break,
                _ => result.push(ch),
            }
            match self.next_char() {
                Some(next) => ch = next,
                None => break,
            }
        }
        Some(result)
    }

    /// Consumes the body of a double-quoted string (the opening quote is already consumed),
    /// appending the unescaped bytes to `result`.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char() {
                Some(b'"') => return Ok(()),
                Some(b'\\') => match self.next_char() {
                    // \<newline> => nothing
                    Some(b'\n') => {}
                    Some(escaped) => {
                        // \$ => $ (likewise for `, " and \); any other \x => \x
                        if !matches!(escaped, b'$' | b'`' | b'"' | b'\\') {
                            result.push(b'\\');
                        }
                        result.push(escaped);
                    }
                    None => return Err(()),
                },
                Some(byte) => result.push(byte),
                None => return Err(()),
            }
        }
    }

    /// Consumes the body of a single-quoted string (the opening quote is already consumed),
    /// appending every byte up to the closing quote to `result` verbatim.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char() {
                Some(b'\'') => return Ok(()),
                Some(byte) => result.push(byte),
                None => return Err(()),
            }
        }
    }

    /// Reads the next input byte, bumping `line_no` whenever a newline is consumed.
    fn next_char(&mut self) -> Option<u8> {
        let res = self.in_iter.next().copied();
        if res == Some(b'\n') {
            self.line_no += 1;
        }
        res
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = Vec<u8>;

    /// Returns the next word, or `None` at end of input or after an error (see `had_error`).
    fn next(&mut self) -> Option<Self::Item> {
        // No initial character means the input is exhausted.
        let mut ch = self.next_char()?;
        // Skip leading whitespace and comments.
        loop {
            match ch {
                b' ' | b'\t' | b'\n' => {}
                b'#' => {
                    // A comment runs up to and including the next newline.
                    while let Some(ch2) = self.next_char() {
                        if ch2 == b'\n' {
                            break;
                        }
                    }
                }
                _ => break,
            }
            ch = self.next_char()?;
        }
        self.parse_word(ch)
    }
}
158/// Convenience function that consumes the whole byte string at once. Returns None if the input was
159/// erroneous.
160pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
161 let mut shl: Shlex<'_> = Shlex::new(in_bytes);
162 let res: Vec> = shl.by_ref().collect();
163 if shl.had_error { None } else { Some(res) }
166/// A more configurable interface to quote strings. If you only want the default settings you can
167/// use the convenience functions [`try_quote`] and [`try_join`].
169/// The string equivalent is [`shlex::Quoter`].
170#[derive(Default, Debug, Clone)]
171pub struct Quoter {
172 allow_nul: bool,
173 // TODO: more options
176impl Quoter {
177 /// Create a new [`Quoter`] with default settings.
178 #[inline]
179 pub fn new() -> Self {
180 Self::default()
181 }
183 /// Set whether to allow [nul bytes](quoting_warning#nul-bytes). By default they are not
184 /// allowed and will result in an error of [`QuoteError::Nul`].
185 #[inline]
186 pub fn allow_nul(mut self, allow: bool) -> Self {
187 self.allow_nul = allow;
188 self
189 }
191 /// Convenience function that consumes an iterable of words and turns it into a single byte string,
192 /// quoting words when necessary. Consecutive words will be separated by a single space.
193 pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(&self, words: I) -> Result<Vec<u8>, QuoteError> {
194 Ok(words.into_iter()
195 .map(|word| self.quote(word))
196 .collect::<Result<Vec<Cow<[u8]>>, QuoteError>>()?
197 .join(&b' '))
198 }
200 /// Given a single word, return a byte string suitable to encode it as a shell argument.
201 ///
202 /// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
203 /// ever inserts valid ASCII characters before or after existing ASCII characters (or
204 /// returns two single quotes if the input was an empty string). It will never modify a
205 /// multibyte UTF-8 character.
206 pub fn quote<'a>(&self, mut in_bytes: &'a [u8]) -> Result<Cow<'a, [u8]>, QuoteError> {
207 if in_bytes.is_empty() {
208 // Empty string. Special case that isn't meaningful as only part of a word.
209 return Ok(b"''"[..].into());
210 }
211 if !self.allow_nul && in_bytes.iter().any(|&b| b == b'\0') {
212 return Err(QuoteError::Nul);
213 }
214 let mut out: Vec<u8> = Vec::new();
215 while !in_bytes.is_empty() {
216 // Pick a quoting strategy for some prefix of the input. Normally this will cover the
217 // entire input, but in some case we might need to divide the input into multiple chunks
218 // that are quoted differently.
219 let (cur_len, strategy) = quoting_strategy(in_bytes);
220 if cur_len == in_bytes.len() && strategy == QuotingStrategy::Unquoted && out.is_empty() {
221 // Entire string can be represented unquoted. Reuse the allocation.
222 return Ok(in_bytes.into());
223 }
224 let (cur_chunk, rest) = in_bytes.split_at(cur_len);
225 assert!(rest.len() < in_bytes.len()); // no infinite loop
226 in_bytes = rest;
227 append_quoted_chunk(&mut out, cur_chunk, strategy);
228 }
229 Ok(out.into())
230 }
/// The way a chunk of input is rendered so that the shell reads it back literally.
// `PartialEq` is needed so that a chosen strategy can be compared with `==`.
#[derive(PartialEq)]
enum QuotingStrategy {
    /// No quotes and no backslash escapes.  (If backslash escapes would be necessary, we use a
    /// different strategy instead.)
    Unquoted,
    /// Single quoted.
    SingleQuoted,
    /// Double quotes, potentially with backslash escapes.
    DoubleQuoted,
    // TODO: add $'xxx' and "$(printf 'xxx')" styles
}
/// Is this ASCII byte okay to emit unquoted?
///
/// Every ASCII byte is listed explicitly in one of the two arms so that the compiler's
/// exhaustiveness check guarantees each one was classified deliberately. Non-ASCII bytes
/// (`0x80..=0xff`) are reported as not okay.
const fn unquoted_ok(c: u8) -> bool {
    match c {
        // Allowed characters:
        b'+' | b'-' | b'.' | b'/' | b':' | b'@' | b']' | b'_' |
        b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
        => true,

        // Non-allowed characters:
        // From POSIX https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
        // "The application shall quote the following characters if they are to represent themselves:"
        b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\'' | b' ' | b'\t' | b'\n' |
        // "and the following may need to be quoted under certain circumstances[..]:"
        b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%' |
        // Brace expansion.  These ought to be in the POSIX list but aren't yet;
        // see: https://www.austingroupbugs.net/view.php?id=1193
        b'{' | b'}' |
        // Also quote comma, just to be safe in the extremely odd case that the user of this crate
        // is intentionally placing a quoted string inside a brace expansion, e.g.:
        //     format!("echo foo{{a,b,{}}}", shlex::quote(some_str))
        b',' |
        // '\r' is allowed in a word by all real shells I tested, but is treated as a word
        // separator by Python `shlex`, and might be translated to '\n' in interactive mode.
        b'\r' |
        // '!' and '^' are treated specially in interactive mode; see quoting_warning.
        b'!' | b'^' |
        // Nul bytes and control characters.
        0x00..=0x1f | 0x7f
        => false,

        // Non-ASCII bytes are handled separately in `quoting_strategy`, so callers are not
        // expected to pass them here.  Answer conservatively ("needs quoting") rather than
        // recursing, which would never terminate.
        0x80..=0xff => false,
    }
    // Note: The logic cited above for quoting comma might suggest that `..` should also be quoted
    // (since it is a special case of brace expansion).  But it's not necessary.  There are three
    // cases:
    //
    // 1. The user wants comma-based brace expansion, but the untrusted string being `quote`d
    //    contains `..`, so they get something like `{foo,bar,3..5}`.
    //  => That's safe; both Bash and Zsh expand this to `foo bar 3..5` rather than
    //     `foo bar 3 4 5`.  The presence of commas disables sequence expression expansion.
    //
    // 2. The user wants comma-based brace expansion where the contents of the braces are a
    //    variable number of `quote`d strings and nothing else.  There happens to be exactly
    //    one string and it contains `..`, so they get something like `{3..5}`.
    //  => Then this will expand as a sequence expression, which is unintended.  But I don't mind,
    //     because any such code is already buggy.  Suppose the untrusted string *didn't* contain
    //     `,` or `..`, resulting in shell input like `{foo}`.  Then the shell would interpret it
    //     as the literal string `{foo}` rather than brace-expanding it into `foo`.
    //
    // 3. The user wants a sequence expression and wants to supply an untrusted string as one of
    //    the endpoints or the increment.
    //  => Well, that's just silly, since the endpoints can only be numbers or single letters.
}
303/// Optimized version of `unquoted_ok`.
304fn unquoted_ok_fast(c: u8) -> bool {
305 const UNQUOTED_OK_MASK: u128 = {
306 // Make a mask of all bytes in 0..<0x80 that pass.
307 let mut c: u8 = 0u8;
308 let mut mask: u128 = 0u128;
309 while c < 0x80 {
310 if unquoted_ok(c) {
311 mask |= 1u128 << c;
312 }
313 c += 1;
314 }
315 mask
316 };
317 ((UNQUOTED_OK_MASK >> c) & 1) != 0
/// Is this ASCII byte okay to emit in single quotes?
///
/// Everything is okay except for:
///
/// * `'`: no single quotes in single quotes.
/// * `^`: to work around a Bash bug, `^` is only allowed right after an opening single quote;
///   see quoting_warning.
/// * `\`: backslashes in single quotes are literal according to POSIX, but Fish treats them as
///   an escape character.  Ban them.  Fish doesn't aim to be POSIX-compatible, but we *can*
///   achieve Fish compatibility using double quotes, so we might as well.
fn single_quoted_ok(c: u8) -> bool {
    !matches!(c, b'\'' | b'^' | b'\\')
}
/// Is this ASCII byte okay to emit in double quotes?
///
/// Everything is okay except for:
///
/// * `` ` `` and `$`: work around Python `shlex` bug where parsing "\`" and "\$" doesn't strip
///   the backslash, even though POSIX requires it.
/// * `!` and `^`: these are treated specially in interactive mode; see quoting_warning.
fn double_quoted_ok(c: u8) -> bool {
    !matches!(c, b'`' | b'$' | b'!' | b'^')
}
348/// Given an input, return a quoting strategy that can cover some prefix of the string, along with
349/// the size of that prefix.
351/// Precondition: input size is nonzero. (Empty strings are handled by the caller.)
352/// Postcondition: returned size is nonzero.
353#[cfg_attr(manual_codegen_check, inline(never))]
354fn quoting_strategy(in_bytes: &[u8]) -> (usize, QuotingStrategy) {
355 const UNQUOTED_OK: u8 = 1;
356 const SINGLE_QUOTED_OK: u8 = 2;
357 const DOUBLE_QUOTED_OK: u8 = 4;
360 let mut i = 0;
362 if in_bytes[0] == b'^' {
363 // To work around a Bash bug, ^ is only allowed right after an opening single quote; see
364 // quoting_warning.
365 prev_ok = SINGLE_QUOTED_OK;
366 i = 1;
367 }
369 while i < in_bytes.len() {
370 let c = in_bytes[i];
371 let mut cur_ok = prev_ok;
373 if c >= 0x80 {
374 // Normally, non-ASCII characters shouldn't require quoting, but see quoting_warning.md
375 // about \xa0. For now, just treat all non-ASCII characters as requiring quotes. This
376 // also ensures things are safe in the off-chance that you're in a legacy 8-bit locale that
377 // has additional characters satisfying `isblank`.
378 cur_ok &= !UNQUOTED_OK;
379 } else {
380 if !unquoted_ok_fast(c) {
381 cur_ok &= !UNQUOTED_OK;
382 }
383 if !single_quoted_ok(c){
384 cur_ok &= !SINGLE_QUOTED_OK;
385 }
386 if !double_quoted_ok(c) {
387 cur_ok &= !DOUBLE_QUOTED_OK;
388 }
389 }
391 if cur_ok == 0 {
392 // There are no quoting strategies that would work for both the previous characters and
393 // this one. So we have to end the chunk before this character. The caller will call
394 // `quoting_strategy` again to handle the rest of the string.
395 break;
396 }
398 prev_ok = cur_ok;
399 i += 1;
400 }
402 // Pick the best allowed strategy.
403 let strategy = if prev_ok & UNQUOTED_OK != 0 {
404 QuotingStrategy::Unquoted
405 } else if prev_ok & SINGLE_QUOTED_OK != 0 {
406 QuotingStrategy::SingleQuoted
407 } else if prev_ok & DOUBLE_QUOTED_OK != 0 {
408 QuotingStrategy::DoubleQuoted
409 } else {
410 unreachable!()
411 };
412 debug_assert!(i > 0);
413 (i, strategy)
416fn append_quoted_chunk(out: &mut Vec<u8>, cur_chunk: &[u8], strategy: QuotingStrategy) {
417 match strategy {
418 QuotingStrategy::Unquoted => {
419 out.extend_from_slice(cur_chunk);
420 },
421 QuotingStrategy::SingleQuoted => {
422 out.reserve(cur_chunk.len() + 2);
423 out.push(b'\'');
424 out.extend_from_slice(cur_chunk);
425 out.push(b'\'');
426 },
427 QuotingStrategy::DoubleQuoted => {
428 out.reserve(cur_chunk.len() + 2);
429 out.push(b'"');
430 for &c in cur_chunk.into_iter() {
431 if let b'$' | b'`' | b'"' | b'\\' = c {
432 // Add a preceding backslash.
433 // Note: We shouldn't actually get here for $ and ` because they don't pass
434 // `double_quoted_ok`.
435 out.push(b'\\');
436 }
437 // Add the character itself.
438 out.push(c);
439 }
440 out.push(b'"');
441 },
442 }
445/// Convenience function that consumes an iterable of words and turns it into a single byte string,
446/// quoting words when necessary. Consecutive words will be separated by a single space.
448/// Uses default settings except that nul bytes are passed through, which [may be
449/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
451/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter).
453/// (That configuration never returns `Err`, so this function does not panic.)
455/// The string equivalent is [shlex::join].
456#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")]
457pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> {
458 Quoter::new().allow_nul(allow:true).join(words).unwrap()
461/// Convenience function that consumes an iterable of words and turns it into a single byte string,
462/// quoting words when necessary. Consecutive words will be separated by a single space.
464/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
466/// Equivalent to [`Quoter::new().join(words)`](Quoter).
468/// The string equivalent is [shlex::try_join].
469pub fn try_join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Result<Vec<u8>, QuoteError> {
470 Quoter::new().join(words)
473/// Given a single word, return a string suitable to encode it as a shell argument.
475/// Uses default settings except that nul bytes are passed through, which [may be
476/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
478/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_bytes).unwrap()`](Quoter).
480/// (That configuration never returns `Err`, so this function does not panic.)
482/// The string equivalent is [shlex::quote].
483#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")]
484pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
485 Quoter::new().allow_nul(allow:true).quote(in_bytes).unwrap()
488/// Given a single word, return a string suitable to encode it as a shell argument.
490/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
492/// Equivalent to [`Quoter::new().quote(in_bytes)`](Quoter).
494/// (That configuration never returns `Err`, so this function does not panic.)
496/// The string equivalent is [shlex::try_quote].
497pub fn try_quote(in_bytes: &[u8]) -> Result<Cow<[u8]>, QuoteError> {
498 Quoter::new().quote(in_bytes)
/// Test fixture: a single byte that is not valid UTF-8 on its own.
#[cfg(test)]
const INVALID_UTF8: &[u8] = b"\xa1";
/// Test fixture: the expected quoted form of `INVALID_UTF8` (wrapped in single quotes).
#[cfg(test)]
const INVALID_UTF8_SINGLEQUOTED: &[u8] = b"'\xa1'";
/// Sanity check for the fixture used by the other tests.
#[test]
#[allow(invalid_from_utf8)] // the failing conversion is exactly what is being asserted
fn test_invalid_utf8() {
    // Check that our test string is actually invalid UTF-8.
    assert!(core::str::from_utf8(INVALID_UTF8).is_err());
}
/// Table of `(input, expected)` pairs for `test_split`: `Some(words)` is the expected result of
/// splitting `input`, and `None` means the input must be rejected as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &[(&[u8], Option<&[&[u8]]>)] = &[
    (b"foo$baz", Some(&[b"foo$baz"])),
    (b"foo baz", Some(&[b"foo", b"baz"])),
    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
    (b"   foo \nbar", Some(&[b"foo", b"bar"])),
    (b"foo\\\nbar", Some(&[b"foobar"])),
    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
    (b"'baz\\\''", None),
    (b"\\", None),
    (b"\"\\", None),
    (b"'\\", None),
    (b"\"", None),
    (b"'", None),
    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
    (b"foo #bar", Some(&[b"foo"])),
    (b"foo#bar", Some(&[b"foo#bar"])),
    (b"foo\"#bar", None),
    (b"'\\n'", Some(&[b"\\n"])),
    (b"'\\\\n'", Some(&[b"\\\\n"])),
    (INVALID_UTF8, Some(&[INVALID_UTF8])),
];
/// Runs `split` over every entry of `SPLIT_TEST_ITEMS` and compares against the expected words.
#[test]
fn test_split() {
    for &(input, output) in SPLIT_TEST_ITEMS {
        // Convert the borrowed expectation into the owned shape that `split` returns.
        let expected: Option<Vec<Vec<u8>>> =
            output.map(|words| words.iter().map(|&word| word.to_owned()).collect());
        assert_eq!(split(input), expected);
    }
}
/// Checks that `line_no` counts the newlines consumed so far, plus one.
#[test]
fn test_lineno() {
    let mut sh = Shlex::new(b"\nfoo\nbar");
    let mut saw_bar = false;
    // `while let` rather than `for`: `sh` must stay accessible inside the loop body.
    while let Some(word) = sh.next() {
        if word == b"bar" {
            saw_bar = true;
            assert_eq!(sh.line_no, 3);
        }
    }
    // Guard against the check above being skipped entirely.
    assert!(saw_bar);
}
/// Exercises the deprecated `quote` convenience function on byte strings.
#[test]
#[allow(deprecated)]
fn test_quote() {
    // Validate behavior with invalid UTF-8:
    assert_eq!(quote(INVALID_UTF8), INVALID_UTF8_SINGLEQUOTED);
    // Replicate a few tests from lib.rs.  No need to replicate all of them.
    assert_eq!(quote(b""), &b"''"[..]);
    assert_eq!(quote(b"foobar"), &b"foobar"[..]);
    assert_eq!(quote(b"foo bar"), &b"'foo bar'"[..]);
    assert_eq!(quote(b"'\""), &b"\"'\\\"\""[..]);
}
/// Exercises the deprecated `join` convenience function on byte strings.
#[test]
#[allow(deprecated)]
fn test_join() {
    // Validate behavior with invalid UTF-8:
    assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8_SINGLEQUOTED);
    // Replicate a few tests from lib.rs.  No need to replicate all of them.
    assert_eq!(join(vec![]), &b""[..]);
    assert_eq!(join(vec![&b""[..]]), b"''");
}