Implement enough of a lexer and parser for MLIR to parse extfunc's without
arguments.

PiperOrigin-RevId: 201706570
diff --git a/include/mlir/Parser.h b/include/mlir/Parser.h
new file mode 100644
index 0000000..cb5f1c0
--- /dev/null
+++ b/include/mlir/Parser.h
@@ -0,0 +1,38 @@
+//===- Parser.h - MLIR Parser Library Interface -----------------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file contains the interface to the MLIR parser library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_PARSER_H
+#define MLIR_PARSER_H
+
+namespace llvm {
+  class SourceMgr;
+}
+
+namespace mlir {
+class Module;
+
+/// Parse the main file of `sourceMgr`; on success return a new Module that the
+/// caller owns, otherwise emit diagnostics and return null.
+Module *parseSourceFile(llvm::SourceMgr &sourceMgr);
+
+} // end namespace mlir
+
+#endif // MLIR_PARSER_H
diff --git a/lib/Parser/Lexer.cpp b/lib/Parser/Lexer.cpp
new file mode 100644
index 0000000..5958658
--- /dev/null
+++ b/lib/Parser/Lexer.cpp
@@ -0,0 +1,137 @@
+//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements the lexer for the MLIR textual form.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lexer.h"
+#include "llvm/Support/SourceMgr.h"
+using namespace mlir;
+using llvm::SMLoc;
+using llvm::SourceMgr;
+
+Lexer::Lexer(llvm::SourceMgr &sourceMgr) : sourceMgr(sourceMgr) {
+  // Start lexing at the first character of the main file's buffer.
+  curBuffer = sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID())->getBuffer();
+  curPtr = curBuffer.begin();
+}
+
+/// emitError - Emit an error message and return an Token::error token.
+Token Lexer::emitError(const char *loc, const Twine &message) {
+  // TODO(clattner): If/when we want to implement a -verify mode, this will need
+  // to package up errors into SMDiagnostic and report them.
+  sourceMgr.PrintMessage(SMLoc::getFromPointer(loc), SourceMgr::DK_Error,
+                         message);
+  return formToken(Token::error, loc);
+}
+
+/// Lex and return the next token, skipping whitespace and `;` comments.
+Token Lexer::lexToken() {
+  const char *tokStart = curPtr;
+  switch (*curPtr++) {
+  default:
+    // Handle bare identifiers; the cast avoids UB in isalpha on negative chars.
+    if (isalpha(static_cast<unsigned char>(curPtr[-1])))
+      return lexBareIdentifierOrKeyword(tokStart);
+
+    // Unknown character, emit an error.
+    return emitError(tokStart, "unexpected character");
+
+  case 0:
+    // This may either be a nul character in the source file or may be the EOF
+    // marker that llvm::MemoryBuffer guarantees will be there.
+    if (curPtr-1 == curBuffer.end())
+      return formToken(Token::eof, tokStart);
+
+    LLVM_FALLTHROUGH;
+  case ' ':
+  case '\t':
+  case '\n':
+  case '\r':
+    // Ignore whitespace (and nul characters embedded in the file).
+    return lexToken();
+
+  case '(': return formToken(Token::l_paren, tokStart);
+  case ')': return formToken(Token::r_paren, tokStart);
+  case '<': return formToken(Token::less, tokStart);
+  case '>': return formToken(Token::greater, tokStart);
+
+  case ';': return lexComment();
+  case '@': return lexAtIdentifier(tokStart);
+  }
+}
+
+/// Skip the rest of a `;` comment line and return the token that follows it.
+///
+///   TODO: add a regex for comments here and to the spec.
+///
+Token Lexer::lexComment() {
+  // The leading ';' has already been consumed by lexToken().
+  for (;;) {
+    const char ch = *curPtr++;
+
+    // A newline character terminates the comment; resume normal lexing.
+    if (ch == '\n' || ch == '\r')
+      return lexToken();
+
+    // Every other character belongs to the comment and is skipped.  That
+    // includes a nul byte unless it is the marker at the end of the buffer.
+    if (ch != 0 || curPtr-1 != curBuffer.end())
+      continue;
+
+    // End of buffer: un-consume the terminating nul so that lexToken() sees it
+    // and produces the eof token.
+    --curPtr;
+    return lexToken();
+  }
+}
+
+/// Lex a bare identifier or keyword that starts with a letter.
+///
+///   bare-id ::= letter (letter|digit)*
+///
+Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
+  // Match the rest of the identifier regex: [0-9a-zA-Z]*.  Go through unsigned
+  // char: the <cctype> predicates are undefined for negative char values.
+  while (isalnum(static_cast<unsigned char>(*curPtr)))
+    ++curPtr;
+
+  // Check to see if this identifier is a keyword.
+  StringRef spelling(tokStart, curPtr-tokStart);
+  Token::TokenKind kind = llvm::StringSwitch<Token::TokenKind>(spelling)
+    .Case("cfgfunc", Token::kw_cfgfunc)
+    .Case("extfunc", Token::kw_extfunc)
+    .Case("mlfunc", Token::kw_mlfunc)
+    .Default(Token::bare_identifier);
+
+  return Token(kind, spelling);
+}
+
+/// Lex an '@foo' identifier.
+///
+///   function-id ::= `@` bare-id
+///
+Token Lexer::lexAtIdentifier(const char *tokStart) {
+  // These always start with a letter.  The unsigned char casts keep the
+  // <cctype> calls defined for bytes that are negative as plain char.
+  if (!isalpha(static_cast<unsigned char>(*curPtr++)))
+    return emitError(curPtr-1, "expected letter in @ identifier");
+  while (isalnum(static_cast<unsigned char>(*curPtr)))
+    ++curPtr;
+  return formToken(Token::at_identifier, tokStart);
+}
diff --git a/lib/Parser/Lexer.h b/lib/Parser/Lexer.h
new file mode 100644
index 0000000..5886c5c
--- /dev/null
+++ b/lib/Parser/Lexer.h
@@ -0,0 +1,65 @@
+//===- Lexer.h - MLIR Lexer Interface ---------------------------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file declares the MLIR Lexer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_LIB_PARSER_LEXER_H
+#define MLIR_LIB_PARSER_LEXER_H
+
+#include "Token.h"
+
+namespace llvm {
+  class SourceMgr;
+}
+
+namespace mlir {
+
+/// This class breaks up the main file of a SourceMgr into a token stream.
+class Lexer {
+  llvm::SourceMgr &sourceMgr;
+
+  StringRef curBuffer;
+  const char *curPtr;
+
+  Lexer(const Lexer&) = delete;
+  void operator=(const Lexer&) = delete;
+public:
+  explicit Lexer(llvm::SourceMgr &sourceMgr);
+
+  llvm::SourceMgr &getSourceMgr() { return sourceMgr; }
+
+  Token lexToken();
+
+private:
+  // Helper: build a `kind` token whose spelling spans [tokStart, curPtr).
+  Token formToken(Token::TokenKind kind, const char *tokStart) {
+    return Token(kind, StringRef(tokStart, curPtr-tokStart));
+  }
+
+  Token emitError(const char *loc, const Twine &message);
+
+  // Lexer implementation methods (defined in Lexer.cpp).
+  Token lexComment();
+  Token lexBareIdentifierOrKeyword(const char *tokStart);
+  Token lexAtIdentifier(const char *tokStart);
+};
+
+} // end namespace mlir
+
+#endif  // MLIR_LIB_PARSER_LEXER_H
diff --git a/lib/Parser/Parser.cpp b/lib/Parser/Parser.cpp
new file mode 100644
index 0000000..abad611
--- /dev/null
+++ b/lib/Parser/Parser.cpp
@@ -0,0 +1,186 @@
+//===- Parser.cpp - MLIR Parser Implementation ----------------------------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements the parser for the MLIR textual form.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Parser.h"
+#include "Lexer.h"
+#include "mlir/IR/Module.h"
+#include "llvm/Support/SourceMgr.h"
+using namespace mlir;
+using llvm::SourceMgr;
+
+namespace {
+/// Result of a parse routine.  ParseFailure is nonzero, so a result can be
+/// tested directly: `if (parseFoo()) return ParseFailure;`.
+enum ParseResult {
+  ParseSuccess = 0,
+  ParseFailure = 1
+};
+
+/// Main parser implementation.
+class Parser {
+public:
+  /// Lex the first token of the main file and start with an empty module.
+  explicit Parser(llvm::SourceMgr &sourceMgr)
+      : lex(sourceMgr), curToken(lex.lexToken()), module(new Module()) {}
+
+  /// Parse the whole file; returns the module (caller-owned) or null on error.
+  Module *parseModule();
+private:
+  // State.  Keep lex before curToken: curToken is initialized from lex.
+  Lexer lex;
+
+  // This is the next token that hasn't been consumed yet.
+  Token curToken;
+
+  // This is the result module we are parsing into.
+  std::unique_ptr<Module> module;
+
+  // Helper methods.
+
+  /// Emit an error and return failure.
+  ParseResult emitError(const Twine &message);
+
+  /// Advance the current lexer onto the next token.
+  void consumeToken() {
+    assert(curToken.isNot(Token::eof, Token::error) &&
+           "shouldn't advance past EOF or errors");
+    curToken = lex.lexToken();
+  }
+
+  /// Advance the current lexer onto the next token, asserting what the expected
+  /// current token is.  This is preferred to the above method because it leads
+  /// to more self-documenting code with better checking.
+  void consumeToken(Token::TokenKind kind) {
+    assert(curToken.is(kind) && "consumed an unexpected token");
+    consumeToken();
+  }
+
+  // Type parsing.
+
+  // Top level entity parsing.
+  ParseResult parseFunctionSignature(StringRef &name);
+  ParseResult parseExtFunc();
+};
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Helper methods.
+//===----------------------------------------------------------------------===//
+
+ParseResult Parser::emitError(const Twine &message) {
+  // The diagnostic points at the start of the current (unconsumed) token.
+  // TODO(clattner): a -verify mode would need these packaged as SMDiagnostic.
+  SourceMgr &sourceMgr = lex.getSourceMgr();
+  sourceMgr.PrintMessage(curToken.getLoc(), SourceMgr::DK_Error, message);
+  return ParseFailure;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Type Parsing
+//===----------------------------------------------------------------------===//
+
+// ... TODO
+
+//===----------------------------------------------------------------------===//
+// Top-level entity parsing.
+//===----------------------------------------------------------------------===//
+
+/// Parse a function signature: the function name followed by its parameter
+/// list.  `name` receives the function identifier without its leading '@'.
+///
+///   argument-list ::= type (`,` type)* | /*empty*/
+///   function-signature ::= function-id `(` argument-list `)` (`->` type-list)?
+///
+ParseResult Parser::parseFunctionSignature(StringRef &name) {
+  // The signature opens with the function identifier.
+  if (!curToken.is(Token::at_identifier))
+    return emitError("expected a function identifier like '@foo'");
+  name = curToken.getSpelling().drop_front();
+  consumeToken(Token::at_identifier);
+
+  // TODO: This should actually parse the full grammar here; for now only an
+  // empty argument list is accepted and no result type list is parsed.
+  if (!curToken.is(Token::l_paren))
+    return emitError("expected '(' in function signature");
+  consumeToken(Token::l_paren);
+
+  if (!curToken.is(Token::r_paren))
+    return emitError("expected ')' in function signature");
+  consumeToken(Token::r_paren);
+
+  return ParseSuccess;
+}
+
+
+/// Parse an external function declaration and add the function to the module.
+///
+///   ext-func ::= `extfunc` function-signature
+///
+ParseResult Parser::parseExtFunc() {
+  // The caller dispatched on the `extfunc` keyword; step over it.
+  consumeToken(Token::kw_extfunc);
+
+  StringRef funcName;
+  if (parseFunctionSignature(funcName) != ParseSuccess)
+    return ParseFailure;
+
+  // The declaration parsed cleanly, so record the function in the module.
+  module->functionList.push_back(new Function(funcName));
+  return ParseSuccess;
+}
+
+
+/// This is the top-level module parser.
+Module *Parser::parseModule() {
+  while (1) {
+    switch (curToken.getKind()) {
+    default:
+      emitError("expected a top level entity");
+      return nullptr;
+
+    // If we got to the end of the file, then we're done.
+    case Token::eof:
+      return module.release();
+
+    // If we got an error token, then the lexer already emitted an error, just
+    // stop.  Someday we could introduce error recovery if there was demand for
+    // it.
+    case Token::error:
+      return nullptr;
+
+    case Token::kw_extfunc:
+      if (parseExtFunc())
+        return nullptr;
+      break;
+
+    // TODO: cfgfunc, mlfunc, affine entity declarations, etc.
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+/// Parse the main file of `sourceMgr`.  On success the caller owns the returned
+/// MLIR module; otherwise diagnostics are emitted and null is returned.
+Module *mlir::parseSourceFile(llvm::SourceMgr &sourceMgr) {
+  return Parser(sourceMgr).parseModule();
+}
diff --git a/lib/Parser/Token.cpp b/lib/Parser/Token.cpp
new file mode 100644
index 0000000..551bd1e
--- /dev/null
+++ b/lib/Parser/Token.cpp
@@ -0,0 +1,37 @@
+//===- Token.cpp - MLIR Token Implementation ------------------------------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements the Token class for the MLIR textual form.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+using namespace mlir;
+using llvm::SMLoc;
+using llvm::SMRange;
+
+/// Location of the first character of the token.
+SMLoc Token::getLoc() const { return SMLoc::getFromPointer(spelling.begin()); }
+
+/// Location one past the last character of the token.
+SMLoc Token::getEndLoc() const {
+  return SMLoc::getFromPointer(spelling.end());
+}
+
+/// Source range of the token: starts at getLoc() and ends at getEndLoc(), i.e.
+/// one past the token's last character.
+SMRange Token::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
diff --git a/lib/Parser/Token.h b/lib/Parser/Token.h
new file mode 100644
index 0000000..03c967e
--- /dev/null
+++ b/lib/Parser/Token.h
@@ -0,0 +1,98 @@
+//===- Token.h - MLIR Token Interface ---------------------------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef MLIR_LIB_PARSER_TOKEN_H
+#define MLIR_LIB_PARSER_TOKEN_H
+
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/SMLoc.h"
+
+namespace mlir {
+
+/// A single lexed token of the MLIR textual form: a kind plus its source text.
+class Token {
+public:
+  enum TokenKind {
+    // Markers
+    eof, error,
+
+    // Identifiers.
+    bare_identifier,    // foo
+    at_identifier,      // @foo
+    // TODO: @@foo, etc.
+
+    // Punctuation.
+    l_paren, r_paren,   // ( )
+    less, greater,      // < >
+    // TODO: More punctuation.
+
+    // Keywords.
+    kw_cfgfunc,
+    kw_extfunc,
+    kw_mlfunc,
+    // TODO: More keywords.
+  };
+
+  Token(TokenKind kind, StringRef spelling)
+    : kind(kind), spelling(spelling) {}
+
+  /// Return the source characters covered by this token.
+  StringRef getSpelling() const { return spelling; }
+
+  // Token classification.
+  TokenKind getKind() const { return kind; }
+  bool is(TokenKind k) const { return kind == k; }
+
+  /// Return true if this token is either of the two specified kinds.
+  bool isAny(TokenKind k1, TokenKind k2) const {
+    return kind == k1 || kind == k2;
+  }
+
+  /// Return true if this token is one of the specified kinds (three or more).
+  /// Checks the first kind, then recurses on the remaining ones.
+  template <typename ...T>
+  bool isAny(TokenKind k1, TokenKind k2, TokenKind k3, T... others) const {
+    return is(k1) || isAny(k2, k3, others...);
+  }
+
+  /// Return true if this token isn't of the specified kind.
+  bool isNot(TokenKind k) const { return !is(k); }
+
+  /// Return true if this token isn't one of the specified kinds.
+  template <typename ...T>
+  bool isNot(TokenKind k1, TokenKind k2, T... others) const {
+    return !isAny(k1, k2, others...);
+  }
+
+  /// Location processing.
+  llvm::SMLoc getLoc() const;
+  llvm::SMLoc getEndLoc() const;
+  llvm::SMRange getLocRange() const;
+
+private:
+  /// Discriminator that indicates the sort of token this is.
+  TokenKind kind;
+
+  /// A reference to the entire token contents; this is always a pointer into
+  /// a memory buffer owned by the source manager.
+  StringRef spelling;
+};
+
+} // end namespace mlir
+
+#endif  // MLIR_LIB_PARSER_TOKEN_H
diff --git a/test/IR/check-help-output.mlir b/test/IR/check-help-output.mlir
index 3a62414..617ae78 100644
--- a/test/IR/check-help-output.mlir
+++ b/test/IR/check-help-output.mlir
@@ -1,15 +1,7 @@
-// TODO(andydavis) Resolve relative path issue w.r.t invoking mlir-opt in RUN
-// statements (perhaps through using lit config substitutions).
-//
-// RUN: %S/../../mlir-opt --help | FileCheck --check-prefix=CHECKHELP %s
-// RUN: %S/../../mlir-opt %s -o - | FileCheck %s
-//
-// CHECKHELP: OVERVIEW: MLIR modular optimizer driver
+; TODO(andydavis) Resolve relative path issue w.r.t invoking mlir-opt in RUN
+; statements (perhaps through using lit config substitutions).
+;
+; RUN: %S/../../mlir-opt --help | FileCheck %s
+;
+; CHECK: OVERVIEW: MLIR modular optimizer driver
 
-
-// Right now the input is completely ignored.
-extfunc @foo()
-extfunc @bar()
-
-// CHECK: extfunc @foo()
-// CHECK: extfunc @bar()
diff --git a/test/IR/parser.mlir b/test/IR/parser.mlir
new file mode 100644
index 0000000..21b6a04
--- /dev/null
+++ b/test/IR/parser.mlir
@@ -0,0 +1,15 @@
+; TODO(andydavis) Resolve relative path issue w.r.t invoking mlir-opt in RUN
+; statements (perhaps through using lit config substitutions).
+;
+; RUN: %S/../../mlir-opt %s -o - | FileCheck %s
+
+
+; CHECK: extfunc @foo()
+extfunc @foo()
+
+; CHECK: extfunc @bar()
+extfunc @bar()
+
+; CHECK: extfunc @baz()
+extfunc @baz()
+
diff --git a/tools/mlir-opt/mlir-opt.cpp b/tools/mlir-opt/mlir-opt.cpp
index dee86ed..b5a548d 100644
--- a/tools/mlir-opt/mlir-opt.cpp
+++ b/tools/mlir-opt/mlir-opt.cpp
@@ -22,7 +22,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/IR/Module.h"
+#include "mlir/Parser.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/ToolOutputFile.h"
@@ -56,13 +58,27 @@
 
   cl::ParseCommandLineOptions(argc, argv, "MLIR modular optimizer driver\n");
 
-  // Instantiate an IR object.
-  Module m;
-  m.functionList.push_back(new Function("foo"));
-  m.functionList.push_back(new Function("bar"));
+  // Set up the input file.
+  auto fileOrErr = MemoryBuffer::getFileOrSTDIN(inputFilename);
+  if (std::error_code error = fileOrErr.getError()) {
+    llvm::errs() << argv[0] << ": could not open input file '" << inputFilename
+                 << "': " << error.message() << "\n";
+    return 1;
+  }
+
+  // Tell sourceMgr about this buffer, which is what the parser will pick up.
+  SourceMgr sourceMgr;
+  sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), SMLoc());
+
+  // Parse the input file and emit any errors.
+  std::unique_ptr<Module> module(parseSourceFile(sourceMgr));
+  if (!module) return 1;
 
   // Print the output.
   auto output = getOutputStream();
-  m.print(output->os());
+  module->print(output->os());
   output->keep();
+
+  // Success.
+  return 0;
 }