| use std::char; |
| use std::convert::TryFrom; |
| use std::f64; |
| use std::fmt; |
| use std::num::ParseFloatError; |
| use std::num::ParseIntError; |
| |
| use super::float; |
| use super::loc::Loc; |
| use super::loc::FIRST_COL; |
| use super::str_lit::StrLit; |
| use super::str_lit::StrLitDecodeError; |
| use super::token::Token; |
| use super::token::TokenWithLocation; |
| use super::ParserLanguage; |
| use crate::text_format::lexer::JsonNumberLit; |
| |
/// Errors reported by the lexer in this module.
| #[derive(Debug)] |
| pub enum LexerError { |
/// Input matched no lexer rule; also returned for a raw newline or NUL where a
/// string byte was expected and for a missing opening quote.
| IncorrectInput, // TODO: something better than this |
/// Input ended while more characters were required (including an unterminated `/* */` comment).
| UnexpectedEof, |
/// The next character was required to be exactly this one.
| ExpectChar(char), |
/// Converted from `std::num::ParseIntError` (integer literal did not parse).
| ParseIntError, |
/// Converted from `std::num::ParseFloatError`.
| ParseFloatError, |
/// Malformed float literal; also the conversion target for `float::ProtobufFloatParseError`.
| IncorrectFloatLit, // TODO: how it is different from ParseFloatError? |
/// Unknown character after `\` in a JSON string.
| IncorrectJsonEscape, |
/// A digit required by the JSON number grammar was missing.
| IncorrectJsonNumber, |
/// A `\uXXXX` escape did not convert to a `char`.
| IncorrectUnicodeChar, |
/// A hexadecimal digit was required.
| ExpectHexDigit, |
/// An octal digit was required.
| ExpectOctDigit, |
/// A decimal digit was required.
| ExpectDecDigit, |
/// Wrapper around an error produced while decoding a string literal.
| StrLitDecodeError(StrLitDecodeError), |
/// An identifier was required. NOTE(review): not produced by the code visible in this file.
| ExpectedIdent, |
| } |
| |
| impl fmt::Display for LexerError { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| match self { |
| LexerError::IncorrectInput => write!(f, "Incorrect input"), |
| LexerError::UnexpectedEof => write!(f, "Unexpected EOF"), |
| LexerError::ExpectChar(c) => write!(f, "Expecting char: {}", c), |
| LexerError::ParseIntError => write!(f, "Parse int error"), |
| LexerError::ParseFloatError => write!(f, "Parse float error"), |
| LexerError::IncorrectFloatLit => write!(f, "Incorrect float literal"), |
| LexerError::IncorrectJsonEscape => write!(f, "Incorrect JSON escape"), |
| LexerError::IncorrectJsonNumber => write!(f, "Incorrect JSON number"), |
| LexerError::IncorrectUnicodeChar => write!(f, "Incorrect Unicode char"), |
| LexerError::ExpectHexDigit => write!(f, "Expecting hex digit"), |
| LexerError::ExpectOctDigit => write!(f, "Expecting oct digit"), |
| LexerError::ExpectDecDigit => write!(f, "Expecting dec digit"), |
| LexerError::StrLitDecodeError(e) => write!(f, "{}", e), |
| LexerError::ExpectedIdent => write!(f, "Expecting identifier"), |
| } |
| } |
| } |
| |
/// `LexerError` relies on the default `std::error::Error` methods; nothing is overridden.
| impl std::error::Error for LexerError {} |
| |
/// Result alias used by the lexer functions in this module: `Ok(T)` or a `LexerError`.
| pub type LexerResult<T> = Result<T, LexerError>; |
| |
| impl From<StrLitDecodeError> for LexerError { |
| fn from(e: StrLitDecodeError) -> Self { |
| LexerError::StrLitDecodeError(e) |
| } |
| } |
| |
| impl From<ParseIntError> for LexerError { |
| fn from(_: ParseIntError) -> Self { |
| LexerError::ParseIntError |
| } |
| } |
| |
| impl From<ParseFloatError> for LexerError { |
| fn from(_: ParseFloatError) -> Self { |
| LexerError::ParseFloatError |
| } |
| } |
| |
| impl From<float::ProtobufFloatParseError> for LexerError { |
| fn from(_: float::ProtobufFloatParseError) -> Self { |
| LexerError::IncorrectFloatLit |
| } |
| } |
| |
/// Cursor over an input string that produces tokens.
///
/// The type is `Copy`; several methods copy the lexer, advance the copy and
/// write it back only on success, which gives cheap lookahead/backtracking.
| #[derive(Copy, Clone)] |
| pub struct Lexer<'a> { |
/// Selects the comment syntax to skip and which literal rules apply.
| language: ParserLanguage, |
/// The complete text being tokenized.
| input: &'a str, |
/// Byte offset into `input` of the next unread character.
| pos: usize, |
/// Line/column of the next unread character; updated as characters are consumed.
| pub loc: Loc, |
| } |
| |
/// Returns `true` if `c` may start an identifier: an ASCII letter or `_`.
///
/// The protobuf grammar defines `letter = "A" … "Z" | "a" … "z"` (plus `_`, see
/// google/protobuf#4565). Using `is_ascii_alphabetic` keeps this in line with
/// the ASCII-only check applied to the remaining identifier characters;
/// the previous `is_alphabetic` also accepted non-ASCII Unicode letters.
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
| |
| impl<'a> Lexer<'a> { |
| pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> { |
| Lexer { |
| language, |
| input, |
| pos: 0, |
| loc: Loc::start(), |
| } |
| } |
| |
| /// No more chars |
| pub fn eof(&self) -> bool { |
| self.pos == self.input.len() |
| } |
| |
| /// Remaining chars |
| fn rem_chars(&self) -> &'a str { |
| &self.input[self.pos..] |
| } |
| |
| pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool { |
| self.lookahead_char().map_or(false, p) |
| } |
| |
| fn lookahead_char_is_in(&self, alphabet: &str) -> bool { |
| self.lookahead_char_is(|c| alphabet.contains(c)) |
| } |
| |
| fn next_char_opt(&mut self) -> Option<char> { |
| let rem = self.rem_chars(); |
| if rem.is_empty() { |
| None |
| } else { |
| let mut char_indices = rem.char_indices(); |
| let (_, c) = char_indices.next().unwrap(); |
| let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len()); |
| self.pos += c_len; |
| if c == '\n' { |
| self.loc.line += 1; |
| self.loc.col = FIRST_COL; |
| } else { |
| self.loc.col += 1; |
| } |
| Some(c) |
| } |
| } |
| |
| fn next_char(&mut self) -> LexerResult<char> { |
| self.next_char_opt().ok_or(LexerError::UnexpectedEof) |
| } |
| |
| /// Skip whitespaces |
| fn skip_whitespaces(&mut self) { |
| self.take_while(|c| c.is_whitespace()); |
| } |
| |
| fn skip_c_comment(&mut self) -> LexerResult<()> { |
| if self.skip_if_lookahead_is_str("/*") { |
| let end = "*/"; |
| match self.rem_chars().find(end) { |
| None => Err(LexerError::UnexpectedEof), |
| Some(len) => { |
| let new_pos = self.pos + len + end.len(); |
| self.skip_to_pos(new_pos); |
| Ok(()) |
| } |
| } |
| } else { |
| Ok(()) |
| } |
| } |
| |
| fn skip_cpp_comment(&mut self) { |
| if self.skip_if_lookahead_is_str("//") { |
| loop { |
| match self.next_char_opt() { |
| Some('\n') | None => break, |
| _ => {} |
| } |
| } |
| } |
| } |
| |
| fn skip_sh_comment(&mut self) { |
| if self.skip_if_lookahead_is_str("#") { |
| loop { |
| match self.next_char_opt() { |
| Some('\n') | None => break, |
| _ => {} |
| } |
| } |
| } |
| } |
| |
| fn skip_comment(&mut self) -> LexerResult<()> { |
| match self.language { |
| ParserLanguage::Proto => { |
| self.skip_c_comment()?; |
| self.skip_cpp_comment(); |
| } |
| ParserLanguage::TextFormat => { |
| self.skip_sh_comment(); |
| } |
| ParserLanguage::Json => {} |
| } |
| Ok(()) |
| } |
| |
| pub fn skip_ws(&mut self) -> LexerResult<()> { |
| loop { |
| let pos = self.pos; |
| self.skip_whitespaces(); |
| self.skip_comment()?; |
| if pos == self.pos { |
| // Did not advance |
| return Ok(()); |
| } |
| } |
| } |
| |
| pub fn take_while<F>(&mut self, f: F) -> &'a str |
| where |
| F: Fn(char) -> bool, |
| { |
| let start = self.pos; |
| while self.lookahead_char().map(&f) == Some(true) { |
| self.next_char_opt().unwrap(); |
| } |
| let end = self.pos; |
| &self.input[start..end] |
| } |
| |
| fn lookahead_char(&self) -> Option<char> { |
| self.clone().next_char_opt() |
| } |
| |
| fn lookahead_is_str(&self, s: &str) -> bool { |
| self.rem_chars().starts_with(s) |
| } |
| |
| fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool { |
| if self.lookahead_is_str(s) { |
| let new_pos = self.pos + s.len(); |
| self.skip_to_pos(new_pos); |
| true |
| } else { |
| false |
| } |
| } |
| |
| fn next_char_if<P>(&mut self, p: P) -> Option<char> |
| where |
| P: FnOnce(char) -> bool, |
| { |
| let mut clone = self.clone(); |
| match clone.next_char_opt() { |
| Some(c) if p(c) => { |
| *self = clone; |
| Some(c) |
| } |
| _ => None, |
| } |
| } |
| |
| pub fn next_char_if_eq(&mut self, expect: char) -> bool { |
| self.next_char_if(|c| c == expect) != None |
| } |
| |
| fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> { |
| for c in alphabet.chars() { |
| if self.next_char_if_eq(c) { |
| return Some(c); |
| } |
| } |
| None |
| } |
| |
| fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> { |
| if self.next_char_if_eq(expect) { |
| Ok(()) |
| } else { |
| Err(LexerError::ExpectChar(expect)) |
| } |
| } |
| |
| fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char> |
| where |
| P: FnOnce(char) -> bool, |
| { |
| self.next_char_if(expect).ok_or(err) |
| } |
| |
| // str functions |
| |
| /// properly update line and column |
| fn skip_to_pos(&mut self, new_pos: usize) -> &'a str { |
| assert!(new_pos >= self.pos); |
| assert!(new_pos <= self.input.len()); |
| let pos = self.pos; |
| while self.pos != new_pos { |
| self.next_char_opt().unwrap(); |
| } |
| &self.input[pos..new_pos] |
| } |
| |
| // Protobuf grammar |
| |
| // char functions |
| |
| // letter = "A" … "Z" | "a" … "z" |
| // https://github.com/google/protobuf/issues/4565 |
| fn next_letter_opt(&mut self) -> Option<char> { |
| self.next_char_if(is_letter) |
| } |
| |
| // capitalLetter = "A" … "Z" |
| fn _next_capital_letter_opt(&mut self) -> Option<char> { |
| self.next_char_if(|c| c >= 'A' && c <= 'Z') |
| } |
| |
| fn next_ident_part(&mut self) -> Option<char> { |
| self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_') |
| } |
| |
| // Identifiers |
| |
| // ident = letter { letter | decimalDigit | "_" } |
| fn next_ident_opt(&mut self) -> LexerResult<Option<String>> { |
| if let Some(c) = self.next_letter_opt() { |
| let mut ident = String::new(); |
| ident.push(c); |
| while let Some(c) = self.next_ident_part() { |
| ident.push(c); |
| } |
| Ok(Some(ident)) |
| } else { |
| Ok(None) |
| } |
| } |
| |
| // Integer literals |
| |
| // hexLit = "0" ( "x" | "X" ) hexDigit { hexDigit } |
| fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> { |
| Ok( |
| if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") { |
| let s = self.take_while(|c| c.is_ascii_hexdigit()); |
| Some(u64::from_str_radix(s, 16)? as u64) |
| } else { |
| None |
| }, |
| ) |
| } |
| |
| // decimalLit = ( "1" … "9" ) { decimalDigit } |
| // octalLit = "0" { octalDigit } |
| fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> { |
| // do not advance on number parse error |
| let mut clone = self.clone(); |
| |
| let pos = clone.pos; |
| |
| Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None { |
| clone.take_while(|c| c.is_ascii_digit()); |
| let value = clone.input[pos..clone.pos].parse()?; |
| *self = clone; |
| Some(value) |
| } else { |
| None |
| }) |
| } |
| |
| // hexDigit = "0" … "9" | "A" … "F" | "a" … "f" |
| fn next_hex_digit(&mut self) -> LexerResult<u32> { |
| let mut clone = self.clone(); |
| let r = match clone.next_char()? { |
| c if c >= '0' && c <= '9' => c as u32 - b'0' as u32, |
| c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10, |
| c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10, |
| _ => return Err(LexerError::ExpectHexDigit), |
| }; |
| *self = clone; |
| Ok(r) |
| } |
| |
| // octalDigit = "0" … "7" |
| fn next_octal_digit(&mut self) -> LexerResult<u32> { |
| self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit) |
| .map(|c| c as u32 - '0' as u32) |
| } |
| |
| // decimalDigit = "0" … "9" |
| fn next_decimal_digit(&mut self) -> LexerResult<u32> { |
| self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit) |
| .map(|c| c as u32 - '0' as u32) |
| } |
| |
| // decimals = decimalDigit { decimalDigit } |
| fn next_decimal_digits(&mut self) -> LexerResult<()> { |
| self.next_decimal_digit()?; |
| self.take_while(|c| c >= '0' && c <= '9'); |
| Ok(()) |
| } |
| |
| // intLit = decimalLit | octalLit | hexLit |
| pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> { |
| assert_ne!(ParserLanguage::Json, self.language); |
| |
| self.skip_ws()?; |
| if let Some(i) = self.next_hex_lit_opt()? { |
| return Ok(Some(i)); |
| } |
| if let Some(i) = self.next_decimal_octal_lit_opt()? { |
| return Ok(Some(i)); |
| } |
| Ok(None) |
| } |
| |
| // Floating-point literals |
| |
| // exponent = ( "e" | "E" ) [ "+" | "-" ] decimals |
| fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> { |
| if self.next_char_if_in("eE") != None { |
| self.next_char_if_in("+-"); |
| self.next_decimal_digits()?; |
| Ok(Some(())) |
| } else { |
| Ok(None) |
| } |
| } |
| |
| // floatLit = ( decimals "." [ decimals ] [ exponent ] | decimals exponent | "."decimals [ exponent ] ) | "inf" | "nan" |
| fn next_float_lit(&mut self) -> LexerResult<()> { |
| assert_ne!(ParserLanguage::Json, self.language); |
| |
| // "inf" and "nan" are handled as part of ident |
| if self.next_char_if_eq('.') { |
| self.next_decimal_digits()?; |
| self.next_exponent_opt()?; |
| } else { |
| self.next_decimal_digits()?; |
| if self.next_char_if_eq('.') { |
| self.next_decimal_digits()?; |
| self.next_exponent_opt()?; |
| } else { |
| if self.next_exponent_opt()? == None { |
| return Err(LexerError::IncorrectFloatLit); |
| } |
| } |
| } |
| Ok(()) |
| } |
| |
| // String literals |
| |
| // charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/ |
| // hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit |
| // https://github.com/google/protobuf/issues/4560 |
| // octEscape = '\' octalDigit octalDigit octalDigit |
| // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' ) |
| // quote = "'" | '"' |
| pub fn next_byte_value(&mut self) -> LexerResult<u8> { |
| match self.next_char()? { |
| '\\' => { |
| match self.next_char()? { |
| '\'' => Ok(b'\''), |
| '"' => Ok(b'"'), |
| '\\' => Ok(b'\\'), |
| 'a' => Ok(b'\x07'), |
| 'b' => Ok(b'\x08'), |
| 'f' => Ok(b'\x0c'), |
| 'n' => Ok(b'\n'), |
| 'r' => Ok(b'\r'), |
| 't' => Ok(b'\t'), |
| 'v' => Ok(b'\x0b'), |
| 'x' => { |
| let d1 = self.next_hex_digit()? as u8; |
| let d2 = self.next_hex_digit()? as u8; |
| Ok(((d1 << 4) | d2) as u8) |
| } |
| d if d >= '0' && d <= '7' => { |
| let mut r = d as u8 - b'0'; |
| for _ in 0..2 { |
| match self.next_octal_digit() { |
| Err(_) => break, |
| Ok(d) => r = (r << 3) + d as u8, |
| } |
| } |
| Ok(r) |
| } |
| // https://github.com/google/protobuf/issues/4562 |
| // TODO: overflow |
| c => Ok(c as u8), |
| } |
| } |
| '\n' | '\0' => Err(LexerError::IncorrectInput), |
| // TODO: check overflow |
| c => Ok(c as u8), |
| } |
| } |
| |
| fn char_try_from(i: u32) -> LexerResult<char> { |
| char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar) |
| } |
| |
| pub fn next_json_char_value(&mut self) -> LexerResult<char> { |
| match self.next_char()? { |
| '\\' => match self.next_char()? { |
| '"' => Ok('"'), |
| '\'' => Ok('\''), |
| '\\' => Ok('\\'), |
| '/' => Ok('/'), |
| 'b' => Ok('\x08'), |
| 'f' => Ok('\x0c'), |
| 'n' => Ok('\n'), |
| 'r' => Ok('\r'), |
| 't' => Ok('\t'), |
| 'u' => { |
| let mut v = 0; |
| for _ in 0..4 { |
| let digit = self.next_hex_digit()?; |
| v = v * 16 + digit; |
| } |
| Self::char_try_from(v) |
| } |
| _ => Err(LexerError::IncorrectJsonEscape), |
| }, |
| c => Ok(c), |
| } |
| } |
| |
| // https://github.com/google/protobuf/issues/4564 |
| // strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' ) |
| fn next_str_lit_raw(&mut self) -> LexerResult<String> { |
| let mut raw = String::new(); |
| |
| let mut first = true; |
| loop { |
| if !first { |
| self.skip_ws()?; |
| } |
| |
| let start = self.pos; |
| |
| let q = match self.next_char_if_in("'\"") { |
| Some(q) => q, |
| None if !first => break, |
| None => return Err(LexerError::IncorrectInput), |
| }; |
| first = false; |
| while self.lookahead_char() != Some(q) { |
| self.next_byte_value()?; |
| } |
| self.next_char_expect_eq(q)?; |
| |
| raw.push_str(&self.input[start + 1..self.pos - 1]); |
| } |
| Ok(raw) |
| } |
| |
| fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> { |
| if self.lookahead_char_is_in("'\"") { |
| Ok(Some(self.next_str_lit_raw()?)) |
| } else { |
| Ok(None) |
| } |
| } |
| |
| /// Parse next token as JSON number |
| fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> { |
| assert_eq!(ParserLanguage::Json, self.language); |
| |
| fn is_digit(c: char) -> bool { |
| c >= '0' && c <= '9' |
| } |
| |
| fn is_digit_1_9(c: char) -> bool { |
| c >= '1' && c <= '9' |
| } |
| |
| if !self.lookahead_char_is_in("-0123456789") { |
| return Ok(None); |
| } |
| |
| let mut s = String::new(); |
| if self.next_char_if_eq('-') { |
| s.push('-'); |
| } |
| |
| if self.next_char_if_eq('0') { |
| s.push('0'); |
| } else { |
| s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?); |
| while let Some(c) = self.next_char_if(is_digit) { |
| s.push(c); |
| } |
| } |
| |
| if self.next_char_if_eq('.') { |
| s.push('.'); |
| s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?); |
| while let Some(c) = self.next_char_if(is_digit) { |
| s.push(c); |
| } |
| } |
| |
| if let Some(c) = self.next_char_if_in("eE") { |
| s.push(c); |
| if let Some(c) = self.next_char_if_in("+-") { |
| s.push(c); |
| } |
| s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?); |
| while let Some(c) = self.next_char_if(is_digit) { |
| s.push(c); |
| } |
| } |
| |
| Ok(Some(JsonNumberLit(s))) |
| } |
| |
| fn next_token_inner(&mut self) -> LexerResult<Token> { |
| if self.language == ParserLanguage::Json { |
| if let Some(v) = self.next_json_number_opt()? { |
| return Ok(Token::JsonNumber(v)); |
| } |
| } |
| |
| if let Some(ident) = self.next_ident_opt()? { |
| let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN { |
| Token::FloatLit(f64::NAN) |
| } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF { |
| Token::FloatLit(f64::INFINITY) |
| } else { |
| Token::Ident(ident.to_owned()) |
| }; |
| return Ok(token); |
| } |
| |
| if self.language != ParserLanguage::Json { |
| let mut clone = self.clone(); |
| let pos = clone.pos; |
| if let Ok(_) = clone.next_float_lit() { |
| let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?; |
| *self = clone; |
| return Ok(Token::FloatLit(f)); |
| } |
| |
| if let Some(lit) = self.next_int_lit_opt()? { |
| return Ok(Token::IntLit(lit)); |
| } |
| } |
| |
| if let Some(escaped) = self.next_str_lit_raw_opt()? { |
| return Ok(Token::StrLit(StrLit { escaped })); |
| } |
| |
| // This branch must be after str lit |
| if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) { |
| return Ok(Token::Symbol(c)); |
| } |
| |
| if let Some(ident) = self.next_ident_opt()? { |
| return Ok(Token::Ident(ident)); |
| } |
| |
| Err(LexerError::IncorrectInput) |
| } |
| |
| pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> { |
| self.skip_ws()?; |
| let loc = self.loc; |
| |
| Ok(if self.eof() { |
| None |
| } else { |
| let token = self.next_token_inner()?; |
| // Skip whitespace here to update location |
| // to the beginning of the next token |
| self.skip_ws()?; |
| Some(TokenWithLocation { token, loc }) |
| }) |
| } |
| } |
| |
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `parse_what` on a Proto-mode lexer over `input`, requiring that it
    /// succeeds and consumes the whole input.
    fn lex<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<R>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let outcome = parse_what(&mut lexer);
        let value = outcome.expect(&format!("lexer failed at {}", lexer.loc));
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        value
    }

    /// Like `lex`, but for parsers returning `Option`; additionally requires `Some`.
    fn lex_opt<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let outcome = parse_what(&mut lexer);
        let maybe = outcome.expect(&format!("lexer failed at {}", lexer.loc));
        let value = maybe.expect(&format!("lexer returned none at {}", lexer.loc));
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        value
    }

    #[test]
    fn test_lexer_int_lit() {
        let parsed = lex_opt("10", |lexer| lexer.next_int_lit_opt());
        assert_eq!(10, parsed);
    }

    #[test]
    fn test_lexer_float_lit() {
        let parsed = lex("12.3", |lexer| lexer.next_token_inner());
        assert_eq!(Token::FloatLit(12.3), parsed);
    }
}