//! Lexer for the ISLE language.
use crate::error::{Error, Errors, Span};
use std::borrow::Cow;
use std::path::Path;
use std::sync::Arc;
type Result<T> = std::result::Result<T, Errors>;
/// The lexer.
///
/// Breaks source text up into a sequence of tokens (with source positions).
#[derive(Clone, Debug)]
pub struct Lexer<'a> {
/// Arena of filenames from the input source.
///
/// Indexed via `Pos::file`.
pub filenames: Vec<Arc<str>>,
/// Arena of file source texts.
///
/// Indexed via `Pos::file`.
pub file_texts: Vec<Arc<str>>,
file_starts: Vec<usize>,
buf: Cow<'a, [u8]>,
pos: Pos,
lookahead: Option<(Pos, Token)>,
}
/// A source position.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Hash, PartialOrd, Ord)]
pub struct Pos {
/// This source position's file.
///
/// Indexes into `Lexer::filenames` early in the compiler pipeline, and
/// later into `TypeEnv::filenames` once we get into semantic analysis.
pub file: usize,
/// This source position's byte offset in the file.
pub offset: usize,
/// This source position's line number in the file.
pub line: usize,
/// This source position's column number in the file.
pub col: usize,
}
impl Pos {
    /// Format this source position as `file.isle line 12`.
pub fn pretty_print_line(&self, filenames: &[Arc<str>]) -> String {
format!("{} line {}", filenames[self.file], self.line)
}
}
/// A token of ISLE source.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
/// Left paren.
LParen,
/// Right paren.
RParen,
/// A symbol, e.g. `Foo`.
Symbol(String),
/// An integer.
Int(i128),
/// `@`
At,
}
impl<'a> Lexer<'a> {
/// Create a new lexer for the given source contents and filename.
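    ///
    /// A minimal usage sketch (assuming this module is exposed as
    /// `cranelift_isle::lexer`):
    ///
    /// ```ignore
    /// use cranelift_isle::lexer::{Lexer, Token};
    ///
    /// let mut lexer = Lexer::from_str("(foo 42)", "example.isle").unwrap();
    /// let mut toks = vec![];
    /// while let Some((_pos, tok)) = lexer.next().unwrap() {
    ///     toks.push(tok);
    /// }
    /// assert_eq!(
    ///     toks,
    ///     vec![
    ///         Token::LParen,
    ///         Token::Symbol("foo".to_string()),
    ///         Token::Int(42),
    ///         Token::RParen,
    ///     ]
    /// );
    /// ```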
pub fn from_str(s: &'a str, filename: &'a str) -> Result<Lexer<'a>> {
let mut l = Lexer {
filenames: vec![filename.into()],
file_texts: vec![s.into()],
file_starts: vec![0],
buf: Cow::Borrowed(s.as_bytes()),
pos: Pos {
file: 0,
offset: 0,
line: 1,
col: 0,
},
lookahead: None,
};
l.reload()?;
Ok(l)
}
/// Create a new lexer from the given files.
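    ///
    /// File contents are read from disk and concatenated (with a newline
    /// after each file) into one internal buffer; `Pos::file` indexes back
    /// into `filenames` and `file_texts`. A hypothetical sketch (the file
    /// names are illustrative):
    ///
    /// ```ignore
    /// let lexer = Lexer::from_files(["inst.isle", "lower.isle"])?;
    /// ```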
pub fn from_files<P>(file_paths: impl IntoIterator<Item = P>) -> Result<Lexer<'a>>
where
P: AsRef<Path>,
{
let mut filenames = Vec::<Arc<str>>::new();
let mut file_texts = Vec::<Arc<str>>::new();
for f in file_paths {
let f = f.as_ref();
filenames.push(f.display().to_string().into());
let s = std::fs::read_to_string(f)
.map_err(|e| Errors::from_io(e, format!("failed to read file: {}", f.display())))?;
file_texts.push(s.into());
}
assert!(!filenames.is_empty());
let mut file_starts = vec![];
let mut buf = String::new();
for text in &file_texts {
file_starts.push(buf.len());
buf += &text;
buf += "\n";
}
let mut l = Lexer {
filenames,
file_texts,
buf: Cow::Owned(buf.into_bytes()),
file_starts,
pos: Pos {
file: 0,
offset: 0,
line: 1,
col: 0,
},
lookahead: None,
};
l.reload()?;
Ok(l)
}
/// Get the lexer's current source position.
pub fn pos(&self) -> Pos {
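        // Internally, `self.pos.offset` indexes the concatenated buffer of
        // all input files; report it relative to the current file's start.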
Pos {
file: self.pos.file,
offset: self.pos.offset - self.file_starts[self.pos.file],
line: self.pos.line,
            col: self.pos.col,
}
}
fn advance_pos(&mut self) {
self.pos.col += 1;
if self.buf[self.pos.offset] == b'\n' {
self.pos.line += 1;
self.pos.col = 0;
}
self.pos.offset += 1;
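        // `from_files` separates the concatenated inputs with a synthetic
        // newline; once the offset reaches the next file's start, roll the
        // position over to that file.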
if self.pos.file + 1 < self.file_starts.len() {
let next_start = self.file_starts[self.pos.file + 1];
if self.pos.offset >= next_start {
assert!(self.pos.offset == next_start);
self.pos.file += 1;
self.pos.line = 1;
}
}
}
fn error(&self, pos: Pos, msg: impl Into<String>) -> Errors {
Errors {
errors: vec![Error::ParseError {
msg: msg.into(),
span: Span::new_single(pos),
}],
filenames: self.filenames.clone(),
file_texts: self.file_texts.clone(),
}
}
fn next_token(&mut self) -> Result<Option<(Pos, Token)>> {
fn is_sym_first_char(c: u8) -> bool {
match c {
b'-' | b'0'..=b'9' | b'(' | b')' | b';' | b'<' | b'>' => false,
c if c.is_ascii_whitespace() => false,
_ => true,
}
}
fn is_sym_other_char(c: u8) -> bool {
match c {
b'(' | b')' | b';' | b'@' | b'<' => false,
c if c.is_ascii_whitespace() => false,
_ => true,
}
}
// Skip any whitespace and any comments.
while self.pos.offset < self.buf.len() {
if self.buf[self.pos.offset].is_ascii_whitespace() {
self.advance_pos();
continue;
}
if self.buf[self.pos.offset] == b';' {
while self.pos.offset < self.buf.len() && self.buf[self.pos.offset] != b'\n' {
self.advance_pos();
}
continue;
}
break;
}
if self.pos.offset == self.buf.len() {
return Ok(None);
}
let char_pos = self.pos();
match self.buf[self.pos.offset] {
b'(' => {
self.advance_pos();
Ok(Some((char_pos, Token::LParen)))
}
b')' => {
self.advance_pos();
Ok(Some((char_pos, Token::RParen)))
}
b'@' => {
self.advance_pos();
Ok(Some((char_pos, Token::At)))
}
c if is_sym_first_char(c) => {
let start = self.pos.offset;
let start_pos = self.pos();
while self.pos.offset < self.buf.len()
&& is_sym_other_char(self.buf[self.pos.offset])
{
self.advance_pos();
}
let end = self.pos.offset;
let s = std::str::from_utf8(&self.buf[start..end])
.expect("Only ASCII characters, should be UTF-8");
debug_assert!(!s.is_empty());
Ok(Some((start_pos, Token::Symbol(s.to_string()))))
}
            c if c.is_ascii_digit() || c == b'-' => {
let start_pos = self.pos();
let neg = if c == b'-' {
self.advance_pos();
true
} else {
false
};
let mut radix = 10;
// Check for hex literals.
if self.buf.get(self.pos.offset).copied() == Some(b'0')
&& (self.buf.get(self.pos.offset + 1).copied() == Some(b'x')
|| self.buf.get(self.pos.offset + 1).copied() == Some(b'X'))
{
self.advance_pos();
self.advance_pos();
radix = 16;
}
                // Collect the digits of this integer literal (skipping any
                // `_` separators) into a buffer that we'll hand to
                // `i128::from_str_radix` below.
let mut s = vec![];
                while self.pos.offset < self.buf.len() {
                    let c = self.buf[self.pos.offset];
                    let is_digit = match radix {
                        16 => c.is_ascii_hexdigit(),
                        _ => c.is_ascii_digit(),
                    };
                    if !is_digit && c != b'_' {
                        break;
                    }
                    // Underscore separators are allowed but contribute no
                    // digits.
                    if c != b'_' {
                        s.push(c);
                    }
                    self.advance_pos();
                }
let s_utf8 = std::str::from_utf8(&s[..]).unwrap();
// Support either signed range (-2^127..2^127) or
// unsigned range (0..2^128).
let num = i128::from_str_radix(s_utf8, radix)
.or_else(|_| u128::from_str_radix(s_utf8, radix).map(|val| val as i128))
.map_err(|e| self.error(start_pos, e.to_string()))?;
let tok = if neg {
Token::Int(num.checked_neg().ok_or_else(|| {
self.error(start_pos, "integer literal cannot fit in i128")
})?)
} else {
Token::Int(num)
};
Ok(Some((start_pos, tok)))
}
            c => Err(self.error(
                char_pos,
                format!("Unexpected character '{}'", c as char),
            )),
}
}
/// Get the next token from this lexer's token stream, if any.
pub fn next(&mut self) -> Result<Option<(Pos, Token)>> {
let tok = self.lookahead.take();
self.reload()?;
Ok(tok)
}
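    // Keep the one-token lookahead primed: `next_token` is pulled eagerly
    // into `self.lookahead` so that `peek` can borrow it without mutation.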
fn reload(&mut self) -> Result<()> {
if self.lookahead.is_none() && self.pos.offset < self.buf.len() {
self.lookahead = self.next_token()?;
}
Ok(())
}
/// Peek ahead at the next token.
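    ///
    /// Unlike [`Lexer::next`], this does not consume the token. A sketch:
    ///
    /// ```ignore
    /// if let Some((_pos, Token::LParen)) = lexer.peek() {
    ///     // Consume the paren we just saw.
    ///     lexer.next()?;
    /// }
    /// ```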
pub fn peek(&self) -> Option<&(Pos, Token)> {
self.lookahead.as_ref()
}
/// Are we at the end of the source input?
pub fn eof(&self) -> bool {
self.lookahead.is_none()
}
}
impl Token {
/// Is this an `Int` token?
pub fn is_int(&self) -> bool {
match self {
Token::Int(_) => true,
_ => false,
}
}
    /// Is this a `Symbol` token?
pub fn is_sym(&self) -> bool {
match self {
Token::Symbol(_) => true,
_ => false,
}
}
}
#[cfg(test)]
mod test {
use super::*;
fn lex(s: &str, file: &str) -> Vec<Token> {
let mut toks = vec![];
let mut lexer = Lexer::from_str(s, file).unwrap();
while let Some((_, tok)) = lexer.next().unwrap() {
toks.push(tok);
}
toks
}
#[test]
fn lexer_basic() {
assert_eq!(
lex(
";; comment\n; another\r\n \t(one two three 23 -568 )\n",
"lexer_basic"
),
vec![
Token::LParen,
Token::Symbol("one".to_string()),
Token::Symbol("two".to_string()),
Token::Symbol("three".to_string()),
Token::Int(23),
Token::Int(-568),
Token::RParen
]
);
}
#[test]
fn ends_with_sym() {
assert_eq!(
lex("asdf", "ends_with_sym"),
vec![Token::Symbol("asdf".to_string()),]
);
}
#[test]
fn ends_with_num() {
assert_eq!(lex("23", "ends_with_num"), vec![Token::Int(23)],);
}
#[test]
fn weird_syms() {
assert_eq!(
lex("(+ [] => !! _test!;comment\n)", "weird_syms"),
vec![
Token::LParen,
Token::Symbol("+".to_string()),
Token::Symbol("[]".to_string()),
Token::Symbol("=>".to_string()),
Token::Symbol("!!".to_string()),
Token::Symbol("_test!".to_string()),
Token::RParen,
]
);
}
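    // Additional sketches covering the integer-literal paths in
    // `next_token`: hex radix, `_` separators, and values that only fit in
    // the unsigned half of the i128 range (which wrap when cast `as i128`).
    #[test]
    fn int_literals() {
        assert_eq!(
            lex("0x2A -0x10 1_000", "int_literals"),
            vec![Token::Int(42), Token::Int(-16), Token::Int(1000)]
        );
        // u128::MAX parses via the `u128::from_str_radix` fallback and is
        // reinterpreted as the i128 bit pattern -1.
        assert_eq!(
            lex("0xFFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF", "int_literals"),
            vec![Token::Int(-1)]
        );
    }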
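    // Sketch of `Pos` line/column tracking (lines are 1-based, columns
    // 0-based, per `from_str`) and of `@`, which terminates a preceding
    // symbol.
    #[test]
    fn pos_and_at() {
        let mut lexer = Lexer::from_str("(\n  x@y", "pos_and_at").unwrap();
        let (pos, tok) = lexer.next().unwrap().unwrap();
        assert_eq!(tok, Token::LParen);
        assert_eq!((pos.line, pos.col), (1, 0));
        let (pos, tok) = lexer.next().unwrap().unwrap();
        assert_eq!(tok, Token::Symbol("x".to_string()));
        assert_eq!((pos.line, pos.col), (2, 2));
        assert_eq!(lexer.next().unwrap().unwrap().1, Token::At);
        assert_eq!(
            lexer.next().unwrap().unwrap().1,
            Token::Symbol("y".to_string())
        );
    }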
}