// blob: 6ff8c9740eb3db24f28d4d5bf69f2970678cb5d2
// Copyright 2015 Nicholas Allegra (comex).
// Licensed under the Apache License, Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0> or
// the MIT license <http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.
//! Same idea as (but implementation not directly based on) the Python shlex module. However, this
//! implementation does not support any of the Python module's customization because it makes
//! parsing slower and is fairly useless. You only get the default settings of shlex.split, which
//! mimic the POSIX shell:
//! http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
//!
//! This implementation also deviates from the Python version in not treating \r specially, which I
//! believe is more compliant.
//!
//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes
//! directly as a micro-optimization.
use std::borrow::Cow;
/// A tokenizer that walks a borrowed string and yields its words one at a time, using the
/// quoting and escaping syntax of the POSIX shell.
pub struct Shlex<'a> {
    /// The not-yet-consumed input, read one byte at a time.
    in_iter: std::str::Bytes<'a>,

    /// One more than the number of newline bytes consumed so far (starts at 1).
    pub line_no: usize,

    /// Becomes `true` when the input ends in the middle of a quotation or directly after an
    /// unescaped backslash. `Iterator` has no channel for reporting errors, so in that case the
    /// unfinished token is dropped and iteration simply stops; inspect this flag once iteration
    /// is over to tell a clean end of input apart from a malformed one.
    pub had_error: bool,
}
impl<'a> Shlex<'a> {
    /// Creates a tokenizer positioned at the start of `in_str`, on line 1 and with no error
    /// recorded.
    pub fn new(in_str: &'a str) -> Self {
        Shlex {
            had_error: false,
            line_no: 1,
            in_iter: in_str.bytes(),
        }
    }

    /// Reads one word whose first byte, `ch`, has already been consumed by the caller.
    ///
    /// The word ends at an unquoted space, tab or newline (which is consumed) or at the end of
    /// the input. Returns `None` and sets `had_error` if the input ends inside a quotation or
    /// right after a backslash.
    fn parse_word(&mut self, mut ch: u8) -> Option<String> {
        let mut word: Vec<u8> = Vec::new();
        loop {
            match ch {
                b'"' => {
                    if self.parse_double(&mut word).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\'' => {
                    if self.parse_single(&mut word).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\\' => match self.next_char() {
                    // backslash-newline is a line continuation: both bytes vanish
                    Some(b'\n') => {}
                    // any other escaped byte is taken literally
                    Some(escaped) => word.push(escaped),
                    // a trailing backslash has nothing to escape
                    None => {
                        self.had_error = true;
                        return None;
                    }
                },
                b' ' | b'\t' | b'\n' => break,
                other => word.push(other),
            }
            match self.next_char() {
                Some(following) => ch = following,
                None => break,
            }
        }
        // SAFETY: the bytes originate from a `&str` (see `new`). They are copied in their
        // original order; the only bytes ever dropped or inserted are ASCII, and ASCII bytes
        // never occur inside a multi-byte UTF-8 sequence, so `word` is still valid UTF-8.
        Some(unsafe { String::from_utf8_unchecked(word) })
    }

    /// Consumes the body of a double-quoted section (the opening `"` is already consumed) up to
    /// and including the closing `"`, appending the unescaped contents to `out`.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    fn parse_double(&mut self, out: &mut Vec<u8>) -> Result<(), ()> {
        while let Some(ch) = self.next_char() {
            match ch {
                b'"' => return Ok(()),
                b'\\' => match self.next_char() {
                    None => return Err(()),
                    // \<newline> => nothing
                    Some(b'\n') => {}
                    Some(escaped) => {
                        match escaped {
                            // \$ => $   (likewise for ` " and \)
                            b'$' | b'`' | b'"' | b'\\' => {}
                            // \x => \x  (the backslash stays for every other byte)
                            _ => out.push(b'\\'),
                        }
                        out.push(escaped);
                    }
                },
                other => out.push(other),
            }
        }
        Err(())
    }

    /// Consumes the body of a single-quoted section (the opening `'` is already consumed) up to
    /// and including the closing `'`, appending the contents to `out`.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    ///
    /// NOTE(review): `\'` and `\\` are treated as escapes here, whereas the POSIX shell keeps
    /// every byte between single quotes literal. Confirm whether this deviation is intended.
    fn parse_single(&mut self, out: &mut Vec<u8>) -> Result<(), ()> {
        while let Some(ch) = self.next_char() {
            match ch {
                b'\'' => return Ok(()),
                b'\\' => match self.next_char() {
                    None => return Err(()),
                    Some(escaped) => {
                        // only ' and \ lose their backslash; anything else keeps it
                        if escaped != b'\'' && escaped != b'\\' {
                            out.push(b'\\');
                        }
                        out.push(escaped);
                    }
                },
                other => out.push(other),
            }
        }
        Err(())
    }

    /// Pulls the next byte from the input, bumping `line_no` whenever that byte is a newline.
    fn next_char(&mut self) -> Option<u8> {
        let byte = self.in_iter.next();
        if let Some(b'\n') = byte {
            self.line_no += 1;
        }
        byte
    }
}
impl<'a> Iterator for Shlex<'a> {
    type Item = String;

    /// Yields the next word, or `None` once the input is exhausted (or malformed, in which case
    /// `had_error` is set by the word parser).
    ///
    /// Spaces, tabs and newlines between words are skipped. A `#` found where a word would
    /// start begins a comment that runs through the next newline; a `#` in the middle of a word
    /// is ordinary text.
    fn next(&mut self) -> Option<String> {
        loop {
            let ch = match self.next_char() {
                Some(byte) => byte,
                // ran out of input before finding the start of a word
                None => return None,
            };
            match ch {
                // separator between words: keep looking
                b' ' | b'\t' | b'\n' => {}
                // comment: discard everything up to and including the newline
                b'#' => {
                    while let Some(skipped) = self.next_char() {
                        if skipped == b'\n' {
                            break;
                        }
                    }
                }
                // first byte of a word
                _ => return self.parse_word(ch),
            }
        }
    }
}
/// Splits all of `in_str` into words in one call.
///
/// Returns `Some(words)` on success, or `None` if the tokenizer flagged the input as erroneous
/// (see `Shlex::had_error`).
pub fn split(in_str: &str) -> Option<Vec<String>> {
    let mut lexer = Shlex::new(in_str);
    let mut words: Vec<String> = Vec::new();
    // `by_ref` keeps `lexer` usable afterwards so its error flag can be inspected.
    for word in lexer.by_ref() {
        words.push(word);
    }
    if !lexer.had_error {
        Some(words)
    } else {
        None
    }
}
/// Encodes a single word so that it can be passed as one shell argument.
///
/// * An empty string becomes `""`.
/// * A string containing none of the characters that are special to the shell is returned
///   unchanged, borrowed from the input.
/// * Anything else is wrapped in double quotes, with `$`, `` ` ``, `"` and `\` each preceded by
///   a backslash.
pub fn quote(in_str: &str) -> Cow<str> {
    if in_str.is_empty() {
        return "\"\"".into();
    }

    let needs_quoting = in_str.bytes().any(|b| match b {
        b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\''
        | b' ' | b'\t' | b'\r' | b'\n' | b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%' => true,
        _ => false,
    });
    if !needs_quoting {
        return in_str.into();
    }

    // Two extra bytes for the surrounding quotes; escapes may grow it further.
    let mut quoted = String::with_capacity(in_str.len() + 2);
    quoted.push('"');
    for c in in_str.chars() {
        // These four keep their special meaning inside double quotes, so escape them.
        match c {
            '$' | '`' | '"' | '\\' => quoted.push('\\'),
            _ => {}
        }
        quoted.push(c);
    }
    quoted.push('"');
    quoted.into()
}
/// Table driving `test_split`: each entry pairs an input string with the words `split` is
/// expected to return, or `None` when the input must be rejected as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &'static [(&'static str, Option<&'static [&'static str]>)] = &[
    // plain words and whitespace separation
    ("foo$baz", Some(&["foo$baz"])),
    ("foo baz", Some(&["foo", "baz"])),
    // double quotes glued to neighbouring text
    ("foo\"bar\"baz", Some(&["foobarbaz"])),
    ("foo \"bar\"baz", Some(&["foo", "barbaz"])),
    // leading whitespace and newlines as separators
    (" foo \nbar", Some(&["foo", "bar"])),
    // backslash-newline continuation, bare and inside double quotes
    ("foo\\\nbar", Some(&["foobar"])),
    ("\"foo\\\nbar\"", Some(&["foobar"])),
    // backslashes inside single quotes
    ("'baz\\$b'", Some(&["baz\\$b"])),
    ("'baz\\\''", Some(&["baz\'"])),
    // input ending right after a backslash
    ("\\", None),
    ("\"\\", None),
    ("'\\", None),
    // unterminated quotations
    ("\"", None),
    ("'", None),
    // comments
    ("foo #bar\nbaz", Some(&["foo", "baz"])),
    ("foo #bar", Some(&["foo"])),
    ("foo#bar", Some(&["foo#bar"])),
    ("foo\"#bar", None),
];
/// Runs `split` over every entry of `SPLIT_TEST_ITEMS` and compares against the expectation.
#[test]
fn test_split() {
    for &(input, expected) in SPLIT_TEST_ITEMS {
        // Convert the borrowed expectation into the owned shape `split` returns.
        let expected: Option<Vec<String>> = match expected {
            Some(words) => Some(words.iter().map(|&w| String::from(w)).collect()),
            None => None,
        };
        assert_eq!(split(input), expected);
    }
}
/// Checks that `line_no` reflects the newlines consumed by the time a word is returned.
///
/// The input has two newlines before "bar", so `line_no` must read 3 once "bar" is yielded.
#[test]
fn test_lineno() {
    let mut sh = Shlex::new("\nfoo\nbar");
    let mut saw_bar = false;
    // `while let` rather than `for`: `sh` must stay accessible inside the loop body.
    while let Some(word) = sh.next() {
        if word == "bar" {
            saw_bar = true;
            assert_eq!(sh.line_no, 3);
        }
    }
    // Without these checks the test would pass vacuously if "bar" were never produced.
    assert!(saw_bar, "tokenizer never yielded \"bar\"");
    assert!(!sh.had_error);
}
/// Checks `quote` on a plain word, a word with a space, a lone double quote and the empty
/// string.
#[test]
fn test_quote() {
    let cases: [(&str, &str); 4] = [
        ("foobar", "foobar"),
        ("foo bar", "\"foo bar\""),
        ("\"", "\"\\\"\""),
        ("", "\"\""),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(quote(input), expected);
    }
}