| //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file defines the lexer for structured comments and the supporting |
| // token class. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H |
| #define LLVM_CLANG_AST_COMMENT_LEXER_H |
| |
| #include "clang/Basic/SourceManager.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/ADT/SmallString.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/Support/Allocator.h" |
| #include "llvm/Support/raw_ostream.h" |
| |
| namespace clang { |
| namespace comments { |
| |
| class Lexer; |
| class TextTokenRetokenizer; |
| |
| namespace tok { |
| enum TokenKind { |
| eof, |
| newline, |
| text, |
| command, |
| verbatim_block_begin, |
| verbatim_block_line, |
| verbatim_block_end, |
| verbatim_line_name, |
| verbatim_line_text, |
| html_tag_open, // <tag |
| html_ident, // attr |
| html_equals, // = |
| html_quoted_string, // "blah\"blah" or 'blah\'blah' |
| html_greater, // > |
| html_tag_close // </tag |
| }; |
| } // end namespace tok |
| |
| class CommentOptions { |
| public: |
| bool Markdown; |
| }; |
| |
| /// \brief Comment token. |
| class Token { |
| friend class Lexer; |
| friend class TextTokenRetokenizer; |
| |
| /// The location of the token. |
| SourceLocation Loc; |
| |
| /// The actual kind of the token. |
| tok::TokenKind Kind; |
| |
| /// Length of the token spelling in comment. Can be 0 for synthenized |
| /// tokens. |
| unsigned Length; |
| |
| /// Contains text value associated with a token. |
| const char *TextPtr1; |
| unsigned TextLen1; |
| |
| public: |
| SourceLocation getLocation() const LLVM_READONLY { return Loc; } |
| void setLocation(SourceLocation SL) { Loc = SL; } |
| |
| SourceLocation getEndLocation() const LLVM_READONLY { |
| if (Length == 0 || Length == 1) |
| return Loc; |
| return Loc.getLocWithOffset(Length - 1); |
| } |
| |
| tok::TokenKind getKind() const LLVM_READONLY { return Kind; } |
| void setKind(tok::TokenKind K) { Kind = K; } |
| |
| bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } |
| bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } |
| |
| unsigned getLength() const LLVM_READONLY { return Length; } |
| void setLength(unsigned L) { Length = L; } |
| |
| StringRef getText() const LLVM_READONLY { |
| assert(is(tok::text)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setText(StringRef Text) { |
| assert(is(tok::text)); |
| TextPtr1 = Text.data(); |
| TextLen1 = Text.size(); |
| } |
| |
| StringRef getCommandName() const LLVM_READONLY { |
| assert(is(tok::command)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setCommandName(StringRef Name) { |
| assert(is(tok::command)); |
| TextPtr1 = Name.data(); |
| TextLen1 = Name.size(); |
| } |
| |
| StringRef getVerbatimBlockName() const LLVM_READONLY { |
| assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setVerbatimBlockName(StringRef Name) { |
| assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
| TextPtr1 = Name.data(); |
| TextLen1 = Name.size(); |
| } |
| |
| StringRef getVerbatimBlockText() const LLVM_READONLY { |
| assert(is(tok::verbatim_block_line)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setVerbatimBlockText(StringRef Text) { |
| assert(is(tok::verbatim_block_line)); |
| TextPtr1 = Text.data(); |
| TextLen1 = Text.size(); |
| } |
| |
| /// Returns the name of verbatim line command. |
| StringRef getVerbatimLineName() const LLVM_READONLY { |
| assert(is(tok::verbatim_line_name)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setVerbatimLineName(StringRef Name) { |
| assert(is(tok::verbatim_line_name)); |
| TextPtr1 = Name.data(); |
| TextLen1 = Name.size(); |
| } |
| |
| StringRef getVerbatimLineText() const LLVM_READONLY { |
| assert(is(tok::verbatim_line_text)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setVerbatimLineText(StringRef Text) { |
| assert(is(tok::verbatim_line_text)); |
| TextPtr1 = Text.data(); |
| TextLen1 = Text.size(); |
| } |
| |
| StringRef getHTMLTagOpenName() const LLVM_READONLY { |
| assert(is(tok::html_tag_open)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setHTMLTagOpenName(StringRef Name) { |
| assert(is(tok::html_tag_open)); |
| TextPtr1 = Name.data(); |
| TextLen1 = Name.size(); |
| } |
| |
| StringRef getHTMLIdent() const LLVM_READONLY { |
| assert(is(tok::html_ident)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setHTMLIdent(StringRef Name) { |
| assert(is(tok::html_ident)); |
| TextPtr1 = Name.data(); |
| TextLen1 = Name.size(); |
| } |
| |
| StringRef getHTMLQuotedString() const LLVM_READONLY { |
| assert(is(tok::html_quoted_string)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setHTMLQuotedString(StringRef Str) { |
| assert(is(tok::html_quoted_string)); |
| TextPtr1 = Str.data(); |
| TextLen1 = Str.size(); |
| } |
| |
| StringRef getHTMLTagCloseName() const LLVM_READONLY { |
| assert(is(tok::html_tag_close)); |
| return StringRef(TextPtr1, TextLen1); |
| } |
| |
| void setHTMLTagCloseName(StringRef Name) { |
| assert(is(tok::html_tag_close)); |
| TextPtr1 = Name.data(); |
| TextLen1 = Name.size(); |
| } |
| |
| void dump(const Lexer &L, const SourceManager &SM) const; |
| }; |
| |
| /// \brief Comment lexer. |
| class Lexer { |
| private: |
| Lexer(const Lexer&); // DO NOT IMPLEMENT |
| void operator=(const Lexer&); // DO NOT IMPLEMENT |
| |
| const char *const BufferStart; |
| const char *const BufferEnd; |
| SourceLocation FileLoc; |
| CommentOptions CommOpts; |
| |
| const char *BufferPtr; |
| |
| /// One past end pointer for the current comment. For BCPL comments points |
| /// to newline or BufferEnd, for C comments points to star in '*/'. |
| const char *CommentEnd; |
| |
| enum LexerCommentState { |
| LCS_BeforeComment, |
| LCS_InsideBCPLComment, |
| LCS_InsideCComment, |
| LCS_BetweenComments |
| }; |
| |
| /// Low-level lexer state, track if we are inside or outside of comment. |
| LexerCommentState CommentState; |
| |
| enum LexerState { |
| /// Lexing normal comment text |
| LS_Normal, |
| |
| /// Finished lexing verbatim block beginning command, will lex first body |
| /// line. |
| LS_VerbatimBlockFirstLine, |
| |
| /// Lexing verbatim block body line-by-line, skipping line-starting |
| /// decorations. |
| LS_VerbatimBlockBody, |
| |
| /// Finished lexing verbatim line beginning command, will lex text (one |
| /// line). |
| LS_VerbatimLineText, |
| |
| /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. |
| LS_HTMLOpenTag, |
| |
| /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. |
| LS_HTMLCloseTag |
| }; |
| |
| /// Current lexing mode. |
| LexerState State; |
| |
| /// A verbatim-like block command eats every character (except line starting |
| /// decorations) until matching end command is seen or comment end is hit. |
| struct VerbatimBlockCommand { |
| StringRef BeginName; |
| StringRef EndName; |
| }; |
| |
| typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector; |
| |
| /// Registered verbatim-like block commands. |
| VerbatimBlockCommandVector VerbatimBlockCommands; |
| |
| /// If State is LS_VerbatimBlock, contains the the name of verbatim end |
| /// command, including command marker. |
| SmallString<16> VerbatimBlockEndCommandName; |
| |
| bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const; |
| |
| /// A verbatim-like line command eats everything until a newline is seen or |
| /// comment end is hit. |
| struct VerbatimLineCommand { |
| StringRef Name; |
| }; |
| |
| typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector; |
| |
| /// Registered verbatim-like line commands. |
| VerbatimLineCommandVector VerbatimLineCommands; |
| |
| bool isVerbatimLineCommand(StringRef Name) const; |
| |
| void formTokenWithChars(Token &Result, const char *TokEnd, |
| tok::TokenKind Kind) { |
| const unsigned TokLen = TokEnd - BufferPtr; |
| Result.setLocation(getSourceLocation(BufferPtr)); |
| Result.setKind(Kind); |
| Result.setLength(TokLen); |
| #ifndef NDEBUG |
| Result.TextPtr1 = "<UNSET>"; |
| Result.TextLen1 = 7; |
| #endif |
| BufferPtr = TokEnd; |
| } |
| |
| SourceLocation getSourceLocation(const char *Loc) const { |
| assert(Loc >= BufferStart && Loc <= BufferEnd && |
| "Location out of range for this buffer!"); |
| |
| const unsigned CharNo = Loc - BufferStart; |
| return FileLoc.getLocWithOffset(CharNo); |
| } |
| |
| /// Eat string matching regexp \code \s*\* \endcode. |
| void skipLineStartingDecorations(); |
| |
| /// Lex stuff inside comments. CommentEnd should be set correctly. |
| void lexCommentText(Token &T); |
| |
| void setupAndLexVerbatimBlock(Token &T, |
| const char *TextBegin, |
| char Marker, StringRef EndName); |
| |
| void lexVerbatimBlockFirstLine(Token &T); |
| |
| void lexVerbatimBlockBody(Token &T); |
| |
| void setupAndLexVerbatimLine(Token &T, const char *TextBegin); |
| |
| void lexVerbatimLineText(Token &T); |
| |
| void setupAndLexHTMLOpenTag(Token &T); |
| |
| void lexHTMLOpenTag(Token &T); |
| |
| void setupAndLexHTMLCloseTag(Token &T); |
| |
| void lexHTMLCloseTag(Token &T); |
| |
| public: |
| Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, |
| const char *BufferStart, const char *BufferEnd); |
| |
| void lex(Token &T); |
| |
| StringRef getSpelling(const Token &Tok, |
| const SourceManager &SourceMgr, |
| bool *Invalid = NULL) const; |
| |
| /// \brief Register a new verbatim block command. |
| void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName); |
| |
| /// \brief Register a new verbatim line command. |
| void addVerbatimLineCommand(StringRef Name); |
| }; |
| |
| /// Re-lexes a sequence of tok::text tokens. |
| class TextTokenRetokenizer { |
| llvm::BumpPtrAllocator &Allocator; |
| static const unsigned MaxTokens = 16; |
| SmallVector<Token, MaxTokens> Toks; |
| |
| struct Position { |
| unsigned CurToken; |
| const char *BufferStart; |
| const char *BufferEnd; |
| const char *BufferPtr; |
| SourceLocation BufferStartLoc; |
| }; |
| |
| /// Current position in Toks. |
| Position Pos; |
| |
| bool isEnd() const { |
| return Pos.CurToken >= Toks.size(); |
| } |
| |
| /// Sets up the buffer pointers to point to current token. |
| void setupBuffer() { |
| assert(Pos.CurToken < Toks.size()); |
| const Token &Tok = Toks[Pos.CurToken]; |
| |
| Pos.BufferStart = Tok.getText().begin(); |
| Pos.BufferEnd = Tok.getText().end(); |
| Pos.BufferPtr = Pos.BufferStart; |
| Pos.BufferStartLoc = Tok.getLocation(); |
| } |
| |
| SourceLocation getSourceLocation() const { |
| const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart; |
| return Pos.BufferStartLoc.getLocWithOffset(CharNo); |
| } |
| |
| char peek() const { |
| assert(!isEnd()); |
| assert(Pos.BufferPtr != Pos.BufferEnd); |
| return *Pos.BufferPtr; |
| } |
| |
| void consumeChar() { |
| assert(!isEnd()); |
| assert(Pos.BufferPtr != Pos.BufferEnd); |
| Pos.BufferPtr++; |
| if (Pos.BufferPtr == Pos.BufferEnd) { |
| Pos.CurToken++; |
| if (Pos.CurToken < Toks.size()) |
| setupBuffer(); |
| } |
| } |
| |
| static bool isWhitespace(char C) { |
| return C == ' ' || C == '\n' || C == '\r' || |
| C == '\t' || C == '\f' || C == '\v'; |
| } |
| |
| void consumeWhitespace() { |
| while (!isEnd()) { |
| if (isWhitespace(peek())) |
| consumeChar(); |
| else |
| break; |
| } |
| } |
| |
| void formTokenWithChars(Token &Result, |
| SourceLocation Loc, |
| const char *TokBegin, |
| unsigned TokLength, |
| StringRef Text) { |
| Result.setLocation(Loc); |
| Result.setKind(tok::text); |
| Result.setLength(TokLength); |
| #ifndef NDEBUG |
| Result.TextPtr1 = "<UNSET>"; |
| Result.TextLen1 = 7; |
| #endif |
| Result.setText(Text); |
| } |
| |
| public: |
| TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator): |
| Allocator(Allocator) { |
| Pos.CurToken = 0; |
| } |
| |
| /// Add a token. |
| /// Returns true on success, false if it seems like we have enough tokens. |
| bool addToken(const Token &Tok) { |
| assert(Tok.is(tok::text)); |
| if (Toks.size() >= MaxTokens) |
| return false; |
| |
| Toks.push_back(Tok); |
| if (Toks.size() == 1) |
| setupBuffer(); |
| return true; |
| } |
| |
| /// Extract a word -- sequence of non-whitespace characters. |
| bool lexWord(Token &Tok) { |
| if (isEnd()) |
| return false; |
| |
| Position SavedPos = Pos; |
| |
| consumeWhitespace(); |
| SmallString<32> WordText; |
| const char *WordBegin = Pos.BufferPtr; |
| SourceLocation Loc = getSourceLocation(); |
| while (!isEnd()) { |
| const char C = peek(); |
| if (!isWhitespace(C)) { |
| WordText.push_back(C); |
| consumeChar(); |
| } else |
| break; |
| } |
| const unsigned Length = WordText.size(); |
| if (Length == 0) { |
| Pos = SavedPos; |
| return false; |
| } |
| |
| char *TextPtr = Allocator.Allocate<char>(Length + 1); |
| |
| memcpy(TextPtr, WordText.c_str(), Length + 1); |
| StringRef Text = StringRef(TextPtr, Length); |
| |
| formTokenWithChars(Tok, Loc, WordBegin, |
| Pos.BufferPtr - WordBegin, Text); |
| return true; |
| } |
| |
| bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) { |
| if (isEnd()) |
| return false; |
| |
| Position SavedPos = Pos; |
| |
| consumeWhitespace(); |
| SmallString<32> WordText; |
| const char *WordBegin = Pos.BufferPtr; |
| SourceLocation Loc = getSourceLocation(); |
| bool Error = false; |
| if (!isEnd()) { |
| const char C = peek(); |
| if (C == OpenDelim) { |
| WordText.push_back(C); |
| consumeChar(); |
| } else |
| Error = true; |
| } |
| char C; |
| while (!Error && !isEnd()) { |
| C = peek(); |
| WordText.push_back(C); |
| consumeChar(); |
| if (C == CloseDelim) |
| break; |
| } |
| if (!Error && C != CloseDelim) |
| Error = true; |
| |
| if (Error) { |
| Pos = SavedPos; |
| return false; |
| } |
| |
| const unsigned Length = WordText.size(); |
| char *TextPtr = Allocator.Allocate<char>(Length + 1); |
| |
| memcpy(TextPtr, WordText.c_str(), Length + 1); |
| StringRef Text = StringRef(TextPtr, Length); |
| |
| formTokenWithChars(Tok, Loc, WordBegin, |
| Pos.BufferPtr - WordBegin, Text); |
| return true; |
| } |
| |
| /// Return a text token. Useful to take tokens back. |
| bool lexText(Token &Tok) { |
| if (isEnd()) |
| return false; |
| |
| if (Pos.BufferPtr != Pos.BufferStart) |
| formTokenWithChars(Tok, getSourceLocation(), |
| Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr, |
| StringRef(Pos.BufferPtr, |
| Pos.BufferEnd - Pos.BufferPtr)); |
| else |
| Tok = Toks[Pos.CurToken]; |
| |
| Pos.CurToken++; |
| if (Pos.CurToken < Toks.size()) |
| setupBuffer(); |
| return true; |
| } |
| }; |
| |
| } // end namespace comments |
| } // end namespace clang |
| |
| #endif |
| |