blob: 5958658b797a575aa21417195c7eedcbf25c8dbf [file] [log] [blame]
//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements the lexer for the MLIR textual form.
//
//===----------------------------------------------------------------------===//
#include "Lexer.h"
#include "llvm/Support/SourceMgr.h"
using namespace mlir;
using llvm::SMLoc;
using llvm::SourceMgr;
Lexer::Lexer(llvm::SourceMgr &sourceMgr) : sourceMgr(sourceMgr) {
auto bufferID = sourceMgr.getMainFileID();
curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
curPtr = curBuffer.begin();
}
/// emitError - Emit an error message and return an Token::error token.
Token Lexer::emitError(const char *loc, const Twine &message) {
// TODO(clattner): If/when we want to implement a -verify mode, this will need
// to package up errors into SMDiagnostic and report them.
sourceMgr.PrintMessage(SMLoc::getFromPointer(loc), SourceMgr::DK_Error,
message);
return formToken(Token::error, loc);
}
Token Lexer::lexToken() {
const char *tokStart = curPtr;
switch (*curPtr++) {
default:
// Handle bare identifiers.
if (isalpha(curPtr[-1]))
return lexBareIdentifierOrKeyword(tokStart);
// Unknown character, emit an error.
return emitError(tokStart, "unexpected character");
case 0:
// This may either be a nul character in the source file or may be the EOF
// marker that llvm::MemoryBuffer guarantees will be there.
if (curPtr-1 == curBuffer.end())
return formToken(Token::eof, tokStart);
LLVM_FALLTHROUGH;
case ' ':
case '\t':
case '\n':
case '\r':
// Ignore whitespace.
return lexToken();
case '(': return formToken(Token::l_paren, tokStart);
case ')': return formToken(Token::r_paren, tokStart);
case '<': return formToken(Token::less, tokStart);
case '>': return formToken(Token::greater, tokStart);
case ';': return lexComment();
case '@': return lexAtIdentifier(tokStart);
}
}
/// Lex a comment line, starting with a semicolon.
///
/// TODO: add a regex for comments here and to the spec.
///
Token Lexer::lexComment() {
while (true) {
switch (*curPtr++) {
case '\n':
case '\r':
// Newline is end of comment.
return lexToken();
case 0:
// If this is the end of the buffer, end the comment.
if (curPtr-1 == curBuffer.end()) {
--curPtr;
return lexToken();
}
LLVM_FALLTHROUGH;
default:
// Skip over other characters.
break;
}
}
}
/// Lex a bare identifier or keyword that starts with a letter.
///
/// bare-id ::= letter (letter|digit)*
///
Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
// Match the rest of the identifier regex: [0-9a-zA-Z]*
while (isalpha(*curPtr) || isdigit(*curPtr))
++curPtr;
// Check to see if this identifier is a keyword.
StringRef spelling(tokStart, curPtr-tokStart);
Token::TokenKind kind = llvm::StringSwitch<Token::TokenKind>(spelling)
.Case("cfgfunc", Token::kw_cfgfunc)
.Case("extfunc", Token::kw_extfunc)
.Case("mlfunc", Token::kw_mlfunc)
.Default(Token::bare_identifier);
return Token(kind, spelling);
}
/// Lex an '@foo' identifier.
///
/// function-id ::= `@` bare-id
///
Token Lexer::lexAtIdentifier(const char *tokStart) {
// These always start with a letter.
if (!isalpha(*curPtr++))
return emitError(curPtr-1, "expected letter in @ identifier");
while (isalpha(*curPtr) || isdigit(*curPtr))
++curPtr;
return formToken(Token::at_identifier, tokStart);
}