blob: f824a7a66c78593f0bab7fd30de3bef748525a90 [file] [log] [blame]
use std::char;
use std::convert::TryFrom;
use std::f64;
use std::fmt;
use std::num::ParseFloatError;
use std::num::ParseIntError;
use super::float;
use super::loc::Loc;
use super::loc::FIRST_COL;
use super::str_lit::StrLit;
use super::str_lit::StrLitDecodeError;
use super::token::Token;
use super::token::TokenWithLocation;
use super::ParserLanguage;
use crate::text_format::lexer::JsonNumberLit;
#[derive(Debug)]
pub enum LexerError {
IncorrectInput, // TODO: something better than this
UnexpectedEof,
ExpectChar(char),
ParseIntError,
ParseFloatError,
IncorrectFloatLit, // TODO: how it is different from ParseFloatError?
IncorrectJsonEscape,
IncorrectJsonNumber,
IncorrectUnicodeChar,
ExpectHexDigit,
ExpectOctDigit,
ExpectDecDigit,
StrLitDecodeError(StrLitDecodeError),
ExpectedIdent,
}
impl fmt::Display for LexerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
LexerError::IncorrectInput => write!(f, "Incorrect input"),
LexerError::UnexpectedEof => write!(f, "Unexpected EOF"),
LexerError::ExpectChar(c) => write!(f, "Expecting char: {}", c),
LexerError::ParseIntError => write!(f, "Parse int error"),
LexerError::ParseFloatError => write!(f, "Parse float error"),
LexerError::IncorrectFloatLit => write!(f, "Incorrect float literal"),
LexerError::IncorrectJsonEscape => write!(f, "Incorrect JSON escape"),
LexerError::IncorrectJsonNumber => write!(f, "Incorrect JSON number"),
LexerError::IncorrectUnicodeChar => write!(f, "Incorrect Unicode char"),
LexerError::ExpectHexDigit => write!(f, "Expecting hex digit"),
LexerError::ExpectOctDigit => write!(f, "Expecting oct digit"),
LexerError::ExpectDecDigit => write!(f, "Expecting dec digit"),
LexerError::StrLitDecodeError(e) => write!(f, "{}", e),
LexerError::ExpectedIdent => write!(f, "Expecting identifier"),
}
}
}
impl std::error::Error for LexerError {}
pub type LexerResult<T> = Result<T, LexerError>;
impl From<StrLitDecodeError> for LexerError {
fn from(e: StrLitDecodeError) -> Self {
LexerError::StrLitDecodeError(e)
}
}
impl From<ParseIntError> for LexerError {
fn from(_: ParseIntError) -> Self {
LexerError::ParseIntError
}
}
impl From<ParseFloatError> for LexerError {
fn from(_: ParseFloatError) -> Self {
LexerError::ParseFloatError
}
}
impl From<float::ProtobufFloatParseError> for LexerError {
fn from(_: float::ProtobufFloatParseError) -> Self {
LexerError::IncorrectFloatLit
}
}
#[derive(Copy, Clone)]
pub struct Lexer<'a> {
language: ParserLanguage,
input: &'a str,
pos: usize,
pub loc: Loc,
}
fn is_letter(c: char) -> bool {
c.is_alphabetic() || c == '_'
}
impl<'a> Lexer<'a> {
pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
Lexer {
language,
input,
pos: 0,
loc: Loc::start(),
}
}
/// No more chars
pub fn eof(&self) -> bool {
self.pos == self.input.len()
}
/// Remaining chars
fn rem_chars(&self) -> &'a str {
&self.input[self.pos..]
}
pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
self.lookahead_char().map_or(false, p)
}
fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
self.lookahead_char_is(|c| alphabet.contains(c))
}
fn next_char_opt(&mut self) -> Option<char> {
let rem = self.rem_chars();
if rem.is_empty() {
None
} else {
let mut char_indices = rem.char_indices();
let (_, c) = char_indices.next().unwrap();
let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len());
self.pos += c_len;
if c == '\n' {
self.loc.line += 1;
self.loc.col = FIRST_COL;
} else {
self.loc.col += 1;
}
Some(c)
}
}
fn next_char(&mut self) -> LexerResult<char> {
self.next_char_opt().ok_or(LexerError::UnexpectedEof)
}
/// Skip whitespaces
fn skip_whitespaces(&mut self) {
self.take_while(|c| c.is_whitespace());
}
fn skip_c_comment(&mut self) -> LexerResult<()> {
if self.skip_if_lookahead_is_str("/*") {
let end = "*/";
match self.rem_chars().find(end) {
None => Err(LexerError::UnexpectedEof),
Some(len) => {
let new_pos = self.pos + len + end.len();
self.skip_to_pos(new_pos);
Ok(())
}
}
} else {
Ok(())
}
}
fn skip_cpp_comment(&mut self) {
if self.skip_if_lookahead_is_str("//") {
loop {
match self.next_char_opt() {
Some('\n') | None => break,
_ => {}
}
}
}
}
fn skip_sh_comment(&mut self) {
if self.skip_if_lookahead_is_str("#") {
loop {
match self.next_char_opt() {
Some('\n') | None => break,
_ => {}
}
}
}
}
fn skip_comment(&mut self) -> LexerResult<()> {
match self.language {
ParserLanguage::Proto => {
self.skip_c_comment()?;
self.skip_cpp_comment();
}
ParserLanguage::TextFormat => {
self.skip_sh_comment();
}
ParserLanguage::Json => {}
}
Ok(())
}
pub fn skip_ws(&mut self) -> LexerResult<()> {
loop {
let pos = self.pos;
self.skip_whitespaces();
self.skip_comment()?;
if pos == self.pos {
// Did not advance
return Ok(());
}
}
}
pub fn take_while<F>(&mut self, f: F) -> &'a str
where
F: Fn(char) -> bool,
{
let start = self.pos;
while self.lookahead_char().map(&f) == Some(true) {
self.next_char_opt().unwrap();
}
let end = self.pos;
&self.input[start..end]
}
fn lookahead_char(&self) -> Option<char> {
self.clone().next_char_opt()
}
fn lookahead_is_str(&self, s: &str) -> bool {
self.rem_chars().starts_with(s)
}
fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
if self.lookahead_is_str(s) {
let new_pos = self.pos + s.len();
self.skip_to_pos(new_pos);
true
} else {
false
}
}
fn next_char_if<P>(&mut self, p: P) -> Option<char>
where
P: FnOnce(char) -> bool,
{
let mut clone = self.clone();
match clone.next_char_opt() {
Some(c) if p(c) => {
*self = clone;
Some(c)
}
_ => None,
}
}
pub fn next_char_if_eq(&mut self, expect: char) -> bool {
self.next_char_if(|c| c == expect) != None
}
fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
for c in alphabet.chars() {
if self.next_char_if_eq(c) {
return Some(c);
}
}
None
}
fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
if self.next_char_if_eq(expect) {
Ok(())
} else {
Err(LexerError::ExpectChar(expect))
}
}
fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
where
P: FnOnce(char) -> bool,
{
self.next_char_if(expect).ok_or(err)
}
// str functions
/// properly update line and column
fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
assert!(new_pos >= self.pos);
assert!(new_pos <= self.input.len());
let pos = self.pos;
while self.pos != new_pos {
self.next_char_opt().unwrap();
}
&self.input[pos..new_pos]
}
// Protobuf grammar
// char functions
// letter = "A" … "Z" | "a" … "z"
// https://github.com/google/protobuf/issues/4565
fn next_letter_opt(&mut self) -> Option<char> {
self.next_char_if(is_letter)
}
// capitalLetter = "A" … "Z"
fn _next_capital_letter_opt(&mut self) -> Option<char> {
self.next_char_if(|c| c >= 'A' && c <= 'Z')
}
fn next_ident_part(&mut self) -> Option<char> {
self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_')
}
// Identifiers
// ident = letter { letter | decimalDigit | "_" }
fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
if let Some(c) = self.next_letter_opt() {
let mut ident = String::new();
ident.push(c);
while let Some(c) = self.next_ident_part() {
ident.push(c);
}
Ok(Some(ident))
} else {
Ok(None)
}
}
// Integer literals
// hexLit = "0" ( "x" | "X" ) hexDigit { hexDigit }
fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
Ok(
if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") {
let s = self.take_while(|c| c.is_ascii_hexdigit());
Some(u64::from_str_radix(s, 16)? as u64)
} else {
None
},
)
}
// decimalLit = ( "1" … "9" ) { decimalDigit }
// octalLit = "0" { octalDigit }
fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
// do not advance on number parse error
let mut clone = self.clone();
let pos = clone.pos;
Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None {
clone.take_while(|c| c.is_ascii_digit());
let value = clone.input[pos..clone.pos].parse()?;
*self = clone;
Some(value)
} else {
None
})
}
// hexDigit = "0" … "9" | "A" … "F" | "a" … "f"
fn next_hex_digit(&mut self) -> LexerResult<u32> {
let mut clone = self.clone();
let r = match clone.next_char()? {
c if c >= '0' && c <= '9' => c as u32 - b'0' as u32,
c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10,
c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10,
_ => return Err(LexerError::ExpectHexDigit),
};
*self = clone;
Ok(r)
}
// octalDigit = "0" … "7"
fn next_octal_digit(&mut self) -> LexerResult<u32> {
self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit)
.map(|c| c as u32 - '0' as u32)
}
// decimalDigit = "0" … "9"
fn next_decimal_digit(&mut self) -> LexerResult<u32> {
self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit)
.map(|c| c as u32 - '0' as u32)
}
// decimals = decimalDigit { decimalDigit }
fn next_decimal_digits(&mut self) -> LexerResult<()> {
self.next_decimal_digit()?;
self.take_while(|c| c >= '0' && c <= '9');
Ok(())
}
// intLit = decimalLit | octalLit | hexLit
pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
assert_ne!(ParserLanguage::Json, self.language);
self.skip_ws()?;
if let Some(i) = self.next_hex_lit_opt()? {
return Ok(Some(i));
}
if let Some(i) = self.next_decimal_octal_lit_opt()? {
return Ok(Some(i));
}
Ok(None)
}
// Floating-point literals
// exponent = ( "e" | "E" ) [ "+" | "-" ] decimals
fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
if self.next_char_if_in("eE") != None {
self.next_char_if_in("+-");
self.next_decimal_digits()?;
Ok(Some(()))
} else {
Ok(None)
}
}
// floatLit = ( decimals "." [ decimals ] [ exponent ] | decimals exponent | "."decimals [ exponent ] ) | "inf" | "nan"
fn next_float_lit(&mut self) -> LexerResult<()> {
assert_ne!(ParserLanguage::Json, self.language);
// "inf" and "nan" are handled as part of ident
if self.next_char_if_eq('.') {
self.next_decimal_digits()?;
self.next_exponent_opt()?;
} else {
self.next_decimal_digits()?;
if self.next_char_if_eq('.') {
self.next_decimal_digits()?;
self.next_exponent_opt()?;
} else {
if self.next_exponent_opt()? == None {
return Err(LexerError::IncorrectFloatLit);
}
}
}
Ok(())
}
// String literals
// charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/
// hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit
// https://github.com/google/protobuf/issues/4560
// octEscape = '\' octalDigit octalDigit octalDigit
// charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
// quote = "'" | '"'
pub fn next_byte_value(&mut self) -> LexerResult<u8> {
match self.next_char()? {
'\\' => {
match self.next_char()? {
'\'' => Ok(b'\''),
'"' => Ok(b'"'),
'\\' => Ok(b'\\'),
'a' => Ok(b'\x07'),
'b' => Ok(b'\x08'),
'f' => Ok(b'\x0c'),
'n' => Ok(b'\n'),
'r' => Ok(b'\r'),
't' => Ok(b'\t'),
'v' => Ok(b'\x0b'),
'x' => {
let d1 = self.next_hex_digit()? as u8;
let d2 = self.next_hex_digit()? as u8;
Ok(((d1 << 4) | d2) as u8)
}
d if d >= '0' && d <= '7' => {
let mut r = d as u8 - b'0';
for _ in 0..2 {
match self.next_octal_digit() {
Err(_) => break,
Ok(d) => r = (r << 3) + d as u8,
}
}
Ok(r)
}
// https://github.com/google/protobuf/issues/4562
// TODO: overflow
c => Ok(c as u8),
}
}
'\n' | '\0' => Err(LexerError::IncorrectInput),
// TODO: check overflow
c => Ok(c as u8),
}
}
fn char_try_from(i: u32) -> LexerResult<char> {
char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar)
}
pub fn next_json_char_value(&mut self) -> LexerResult<char> {
match self.next_char()? {
'\\' => match self.next_char()? {
'"' => Ok('"'),
'\'' => Ok('\''),
'\\' => Ok('\\'),
'/' => Ok('/'),
'b' => Ok('\x08'),
'f' => Ok('\x0c'),
'n' => Ok('\n'),
'r' => Ok('\r'),
't' => Ok('\t'),
'u' => {
let mut v = 0;
for _ in 0..4 {
let digit = self.next_hex_digit()?;
v = v * 16 + digit;
}
Self::char_try_from(v)
}
_ => Err(LexerError::IncorrectJsonEscape),
},
c => Ok(c),
}
}
// https://github.com/google/protobuf/issues/4564
// strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' )
fn next_str_lit_raw(&mut self) -> LexerResult<String> {
let mut raw = String::new();
let mut first = true;
loop {
if !first {
self.skip_ws()?;
}
let start = self.pos;
let q = match self.next_char_if_in("'\"") {
Some(q) => q,
None if !first => break,
None => return Err(LexerError::IncorrectInput),
};
first = false;
while self.lookahead_char() != Some(q) {
self.next_byte_value()?;
}
self.next_char_expect_eq(q)?;
raw.push_str(&self.input[start + 1..self.pos - 1]);
}
Ok(raw)
}
fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
if self.lookahead_char_is_in("'\"") {
Ok(Some(self.next_str_lit_raw()?))
} else {
Ok(None)
}
}
/// Parse next token as JSON number
fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
assert_eq!(ParserLanguage::Json, self.language);
fn is_digit(c: char) -> bool {
c >= '0' && c <= '9'
}
fn is_digit_1_9(c: char) -> bool {
c >= '1' && c <= '9'
}
if !self.lookahead_char_is_in("-0123456789") {
return Ok(None);
}
let mut s = String::new();
if self.next_char_if_eq('-') {
s.push('-');
}
if self.next_char_if_eq('0') {
s.push('0');
} else {
s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
while let Some(c) = self.next_char_if(is_digit) {
s.push(c);
}
}
if self.next_char_if_eq('.') {
s.push('.');
s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
while let Some(c) = self.next_char_if(is_digit) {
s.push(c);
}
}
if let Some(c) = self.next_char_if_in("eE") {
s.push(c);
if let Some(c) = self.next_char_if_in("+-") {
s.push(c);
}
s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
while let Some(c) = self.next_char_if(is_digit) {
s.push(c);
}
}
Ok(Some(JsonNumberLit(s)))
}
fn next_token_inner(&mut self) -> LexerResult<Token> {
if self.language == ParserLanguage::Json {
if let Some(v) = self.next_json_number_opt()? {
return Ok(Token::JsonNumber(v));
}
}
if let Some(ident) = self.next_ident_opt()? {
let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN {
Token::FloatLit(f64::NAN)
} else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF {
Token::FloatLit(f64::INFINITY)
} else {
Token::Ident(ident.to_owned())
};
return Ok(token);
}
if self.language != ParserLanguage::Json {
let mut clone = self.clone();
let pos = clone.pos;
if let Ok(_) = clone.next_float_lit() {
let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?;
*self = clone;
return Ok(Token::FloatLit(f));
}
if let Some(lit) = self.next_int_lit_opt()? {
return Ok(Token::IntLit(lit));
}
}
if let Some(escaped) = self.next_str_lit_raw_opt()? {
return Ok(Token::StrLit(StrLit { escaped }));
}
// This branch must be after str lit
if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) {
return Ok(Token::Symbol(c));
}
if let Some(ident) = self.next_ident_opt()? {
return Ok(Token::Ident(ident));
}
Err(LexerError::IncorrectInput)
}
pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
self.skip_ws()?;
let loc = self.loc;
Ok(if self.eof() {
None
} else {
let token = self.next_token_inner()?;
// Skip whitespace here to update location
// to the beginning of the next token
self.skip_ws()?;
Some(TokenWithLocation { token, loc })
})
}
}
#[cfg(test)]
mod test {
use super::*;
fn lex<P, R>(input: &str, parse_what: P) -> R
where
P: FnOnce(&mut Lexer) -> LexerResult<R>,
{
let mut lexer = Lexer::new(input, ParserLanguage::Proto);
let r = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc));
assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
r
}
fn lex_opt<P, R>(input: &str, parse_what: P) -> R
where
P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
{
let mut lexer = Lexer::new(input, ParserLanguage::Proto);
let o = parse_what(&mut lexer).expect(&format!("lexer failed at {}", lexer.loc));
let r = o.expect(&format!("lexer returned none at {}", lexer.loc));
assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
r
}
#[test]
fn test_lexer_int_lit() {
let msg = r#"10"#;
let mess = lex_opt(msg, |p| p.next_int_lit_opt());
assert_eq!(10, mess);
}
#[test]
fn test_lexer_float_lit() {
let msg = r#"12.3"#;
let mess = lex(msg, |p| p.next_token_inner());
assert_eq!(Token::FloatLit(12.3), mess);
}
}