blob: 6683788227df7744706e6cd5b6d6e1e413486283 [file] [log] [blame]
//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the lexer for structured comments and the supporting
// token class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
#define LLVM_CLANG_AST_COMMENT_LEXER_H
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
namespace clang {
namespace comments {
class Lexer;
class TextTokenRetokenizer;
namespace tok {
/// Kinds of tokens stored in a comments::Token.  Which payload accessor of
/// Token may be used depends on the kind (each accessor asserts the kind).
enum TokenKind {
// NOTE(review): presumably marks the end of the lexed input -- confirm
// against Lexer::lex().
eof,
// NOTE(review): presumably a line break inside the comment -- confirm
// against Lexer::lex().
newline,
// Token with a text payload; read it with Token::getText().
text,
// Token carrying a command name; read it with Token::getCommandName().
command,
// Start of a verbatim block; carries a name
// (Token::getVerbatimBlockName()).
verbatim_block_begin,
// One line of verbatim block content; carries text
// (Token::getVerbatimBlockText()).
verbatim_block_line,
// End of a verbatim block; carries a name
// (Token::getVerbatimBlockName()).
verbatim_block_end,
// Name of a verbatim line command (Token::getVerbatimLineName()).
verbatim_line_name,
// Text of a verbatim line command (Token::getVerbatimLineText()).
verbatim_line_text,
// The tokens below describe pieces of HTML tags; the trailing comment on
// each line shows the spelling the token stands for.
html_tag_open, // <tag
html_ident, // attr
html_equals, // =
html_quoted_string, // "blah\"blah" or 'blah\'blah'
html_greater, // >
html_tag_close // </tag
};
} // end namespace tok
/// \brief Options passed to the comment lexer.
class CommentOptions {
public:
  /// Defaults to false so that a freshly constructed object never holds an
  /// indeterminate value (the Lexer keeps a copy of these options).
  /// NOTE(review): the name suggests this toggles Markdown handling; nothing
  /// in this header reads it -- confirm against the lexer implementation.
  bool Markdown;

  /// Creates options with every flag switched off.
  CommentOptions() : Markdown(false) {}
};
/// \brief Comment token.
class Token {
friend class Lexer;
friend class TextTokenRetokenizer;
/// The location of the token.
SourceLocation Loc;
/// The actual kind of the token.
tok::TokenKind Kind;
/// Length of the token spelling in comment. Can be 0 for synthenized
/// tokens.
unsigned Length;
/// Contains text value associated with a token.
const char *TextPtr1;
unsigned TextLen1;
public:
SourceLocation getLocation() const LLVM_READONLY { return Loc; }
void setLocation(SourceLocation SL) { Loc = SL; }
SourceLocation getEndLocation() const LLVM_READONLY {
if (Length == 0 || Length == 1)
return Loc;
return Loc.getLocWithOffset(Length - 1);
}
tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
void setKind(tok::TokenKind K) { Kind = K; }
bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
unsigned getLength() const LLVM_READONLY { return Length; }
void setLength(unsigned L) { Length = L; }
StringRef getText() const LLVM_READONLY {
assert(is(tok::text));
return StringRef(TextPtr1, TextLen1);
}
void setText(StringRef Text) {
assert(is(tok::text));
TextPtr1 = Text.data();
TextLen1 = Text.size();
}
StringRef getCommandName() const LLVM_READONLY {
assert(is(tok::command));
return StringRef(TextPtr1, TextLen1);
}
void setCommandName(StringRef Name) {
assert(is(tok::command));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
StringRef getVerbatimBlockName() const LLVM_READONLY {
assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
return StringRef(TextPtr1, TextLen1);
}
void setVerbatimBlockName(StringRef Name) {
assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
StringRef getVerbatimBlockText() const LLVM_READONLY {
assert(is(tok::verbatim_block_line));
return StringRef(TextPtr1, TextLen1);
}
void setVerbatimBlockText(StringRef Text) {
assert(is(tok::verbatim_block_line));
TextPtr1 = Text.data();
TextLen1 = Text.size();
}
/// Returns the name of verbatim line command.
StringRef getVerbatimLineName() const LLVM_READONLY {
assert(is(tok::verbatim_line_name));
return StringRef(TextPtr1, TextLen1);
}
void setVerbatimLineName(StringRef Name) {
assert(is(tok::verbatim_line_name));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
StringRef getVerbatimLineText() const LLVM_READONLY {
assert(is(tok::verbatim_line_text));
return StringRef(TextPtr1, TextLen1);
}
void setVerbatimLineText(StringRef Text) {
assert(is(tok::verbatim_line_text));
TextPtr1 = Text.data();
TextLen1 = Text.size();
}
StringRef getHTMLTagOpenName() const LLVM_READONLY {
assert(is(tok::html_tag_open));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLTagOpenName(StringRef Name) {
assert(is(tok::html_tag_open));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
StringRef getHTMLIdent() const LLVM_READONLY {
assert(is(tok::html_ident));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLIdent(StringRef Name) {
assert(is(tok::html_ident));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
StringRef getHTMLQuotedString() const LLVM_READONLY {
assert(is(tok::html_quoted_string));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLQuotedString(StringRef Str) {
assert(is(tok::html_quoted_string));
TextPtr1 = Str.data();
TextLen1 = Str.size();
}
StringRef getHTMLTagCloseName() const LLVM_READONLY {
assert(is(tok::html_tag_close));
return StringRef(TextPtr1, TextLen1);
}
void setHTMLTagCloseName(StringRef Name) {
assert(is(tok::html_tag_close));
TextPtr1 = Name.data();
TextLen1 = Name.size();
}
void dump(const Lexer &L, const SourceManager &SM) const;
};
/// \brief Comment lexer.
class Lexer {
private:
Lexer(const Lexer&); // DO NOT IMPLEMENT
void operator=(const Lexer&); // DO NOT IMPLEMENT
const char *const BufferStart;
const char *const BufferEnd;
SourceLocation FileLoc;
CommentOptions CommOpts;
const char *BufferPtr;
/// One past end pointer for the current comment. For BCPL comments points
/// to newline or BufferEnd, for C comments points to star in '*/'.
const char *CommentEnd;
enum LexerCommentState {
LCS_BeforeComment,
LCS_InsideBCPLComment,
LCS_InsideCComment,
LCS_BetweenComments
};
/// Low-level lexer state, track if we are inside or outside of comment.
LexerCommentState CommentState;
enum LexerState {
/// Lexing normal comment text
LS_Normal,
/// Finished lexing verbatim block beginning command, will lex first body
/// line.
LS_VerbatimBlockFirstLine,
/// Lexing verbatim block body line-by-line, skipping line-starting
/// decorations.
LS_VerbatimBlockBody,
/// Finished lexing verbatim line beginning command, will lex text (one
/// line).
LS_VerbatimLineText,
/// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
LS_HTMLOpenTag,
/// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
LS_HTMLCloseTag
};
/// Current lexing mode.
LexerState State;
/// A verbatim-like block command eats every character (except line starting
/// decorations) until matching end command is seen or comment end is hit.
struct VerbatimBlockCommand {
StringRef BeginName;
StringRef EndName;
};
typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector;
/// Registered verbatim-like block commands.
VerbatimBlockCommandVector VerbatimBlockCommands;
/// If State is LS_VerbatimBlock, contains the the name of verbatim end
/// command, including command marker.
SmallString<16> VerbatimBlockEndCommandName;
bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const;
/// A verbatim-like line command eats everything until a newline is seen or
/// comment end is hit.
struct VerbatimLineCommand {
StringRef Name;
};
typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector;
/// Registered verbatim-like line commands.
VerbatimLineCommandVector VerbatimLineCommands;
bool isVerbatimLineCommand(StringRef Name) const;
void formTokenWithChars(Token &Result, const char *TokEnd,
tok::TokenKind Kind) {
const unsigned TokLen = TokEnd - BufferPtr;
Result.setLocation(getSourceLocation(BufferPtr));
Result.setKind(Kind);
Result.setLength(TokLen);
#ifndef NDEBUG
Result.TextPtr1 = "<UNSET>";
Result.TextLen1 = 7;
#endif
BufferPtr = TokEnd;
}
SourceLocation getSourceLocation(const char *Loc) const {
assert(Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!");
const unsigned CharNo = Loc - BufferStart;
return FileLoc.getLocWithOffset(CharNo);
}
/// Eat string matching regexp \code \s*\* \endcode.
void skipLineStartingDecorations();
/// Lex stuff inside comments. CommentEnd should be set correctly.
void lexCommentText(Token &T);
void setupAndLexVerbatimBlock(Token &T,
const char *TextBegin,
char Marker, StringRef EndName);
void lexVerbatimBlockFirstLine(Token &T);
void lexVerbatimBlockBody(Token &T);
void setupAndLexVerbatimLine(Token &T, const char *TextBegin);
void lexVerbatimLineText(Token &T);
void setupAndLexHTMLOpenTag(Token &T);
void lexHTMLOpenTag(Token &T);
void setupAndLexHTMLCloseTag(Token &T);
void lexHTMLCloseTag(Token &T);
public:
Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
const char *BufferStart, const char *BufferEnd);
void lex(Token &T);
StringRef getSpelling(const Token &Tok,
const SourceManager &SourceMgr,
bool *Invalid = NULL) const;
/// \brief Register a new verbatim block command.
void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName);
/// \brief Register a new verbatim line command.
void addVerbatimLineCommand(StringRef Name);
};
/// Re-lexes a sequence of tok::text tokens.
class TextTokenRetokenizer {
llvm::BumpPtrAllocator &Allocator;
static const unsigned MaxTokens = 16;
SmallVector<Token, MaxTokens> Toks;
struct Position {
unsigned CurToken;
const char *BufferStart;
const char *BufferEnd;
const char *BufferPtr;
SourceLocation BufferStartLoc;
};
/// Current position in Toks.
Position Pos;
bool isEnd() const {
return Pos.CurToken >= Toks.size();
}
/// Sets up the buffer pointers to point to current token.
void setupBuffer() {
assert(Pos.CurToken < Toks.size());
const Token &Tok = Toks[Pos.CurToken];
Pos.BufferStart = Tok.getText().begin();
Pos.BufferEnd = Tok.getText().end();
Pos.BufferPtr = Pos.BufferStart;
Pos.BufferStartLoc = Tok.getLocation();
}
SourceLocation getSourceLocation() const {
const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
return Pos.BufferStartLoc.getLocWithOffset(CharNo);
}
char peek() const {
assert(!isEnd());
assert(Pos.BufferPtr != Pos.BufferEnd);
return *Pos.BufferPtr;
}
void consumeChar() {
assert(!isEnd());
assert(Pos.BufferPtr != Pos.BufferEnd);
Pos.BufferPtr++;
if (Pos.BufferPtr == Pos.BufferEnd) {
Pos.CurToken++;
if (Pos.CurToken < Toks.size())
setupBuffer();
}
}
static bool isWhitespace(char C) {
return C == ' ' || C == '\n' || C == '\r' ||
C == '\t' || C == '\f' || C == '\v';
}
void consumeWhitespace() {
while (!isEnd()) {
if (isWhitespace(peek()))
consumeChar();
else
break;
}
}
void formTokenWithChars(Token &Result,
SourceLocation Loc,
const char *TokBegin,
unsigned TokLength,
StringRef Text) {
Result.setLocation(Loc);
Result.setKind(tok::text);
Result.setLength(TokLength);
#ifndef NDEBUG
Result.TextPtr1 = "<UNSET>";
Result.TextLen1 = 7;
#endif
Result.setText(Text);
}
public:
TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator):
Allocator(Allocator) {
Pos.CurToken = 0;
}
/// Add a token.
/// Returns true on success, false if it seems like we have enough tokens.
bool addToken(const Token &Tok) {
assert(Tok.is(tok::text));
if (Toks.size() >= MaxTokens)
return false;
Toks.push_back(Tok);
if (Toks.size() == 1)
setupBuffer();
return true;
}
/// Extract a word -- sequence of non-whitespace characters.
bool lexWord(Token &Tok) {
if (isEnd())
return false;
Position SavedPos = Pos;
consumeWhitespace();
SmallString<32> WordText;
const char *WordBegin = Pos.BufferPtr;
SourceLocation Loc = getSourceLocation();
while (!isEnd()) {
const char C = peek();
if (!isWhitespace(C)) {
WordText.push_back(C);
consumeChar();
} else
break;
}
const unsigned Length = WordText.size();
if (Length == 0) {
Pos = SavedPos;
return false;
}
char *TextPtr = Allocator.Allocate<char>(Length + 1);
memcpy(TextPtr, WordText.c_str(), Length + 1);
StringRef Text = StringRef(TextPtr, Length);
formTokenWithChars(Tok, Loc, WordBegin,
Pos.BufferPtr - WordBegin, Text);
return true;
}
bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
if (isEnd())
return false;
Position SavedPos = Pos;
consumeWhitespace();
SmallString<32> WordText;
const char *WordBegin = Pos.BufferPtr;
SourceLocation Loc = getSourceLocation();
bool Error = false;
if (!isEnd()) {
const char C = peek();
if (C == OpenDelim) {
WordText.push_back(C);
consumeChar();
} else
Error = true;
}
char C;
while (!Error && !isEnd()) {
C = peek();
WordText.push_back(C);
consumeChar();
if (C == CloseDelim)
break;
}
if (!Error && C != CloseDelim)
Error = true;
if (Error) {
Pos = SavedPos;
return false;
}
const unsigned Length = WordText.size();
char *TextPtr = Allocator.Allocate<char>(Length + 1);
memcpy(TextPtr, WordText.c_str(), Length + 1);
StringRef Text = StringRef(TextPtr, Length);
formTokenWithChars(Tok, Loc, WordBegin,
Pos.BufferPtr - WordBegin, Text);
return true;
}
/// Return a text token. Useful to take tokens back.
bool lexText(Token &Tok) {
if (isEnd())
return false;
if (Pos.BufferPtr != Pos.BufferStart)
formTokenWithChars(Tok, getSourceLocation(),
Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
StringRef(Pos.BufferPtr,
Pos.BufferEnd - Pos.BufferPtr));
else
Tok = Toks[Pos.CurToken];
Pos.CurToken++;
if (Pos.CurToken < Toks.size())
setupBuffer();
return true;
}
};
} // end namespace comments
} // end namespace clang
#endif