blob: cfff29f9eead4bf5e28a8f77d49ae8bf613c6f9a [file] [log] [blame]
/** \file
* Base interface for any ANTLR3 lexer.
*
* An ANTLR3 lexer is built from two sets of components:
*
* - The runtime components that provide common functionality such as
* traversing character streams, building tokens for output and so on.
* - The generated rules and structure of the actual lexer, which call upon the
* runtime components.
*
* A lexer class contains a character input stream, a base recognizer interface
* (which it will normally implement) and a token source interface (which it also
* implements). The token source interface is called by a token consumer (such as
* a parser, but in theory it can be anything that wants a set of abstract
* tokens in place of a raw character stream).
*
* So then, we set up a lexer in a sequence akin to:
*
* - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
* and initialize it.
* - Create a lexer interface and tell it where its input stream is.
* This will cause the creation of a base recognizer class, which it will
* override with its own implementations of some methods. The lexer creator
* can also then in turn override anything it likes.
* - The lexer token source interface is then passed to some interface that
* knows how to use it, by calling for a next token.
* - When a next token is called, let ze lexing begin.
*
*/
#ifndef _ANTLR3_LEXER_HPP
#define _ANTLR3_LEXER_HPP
// [The "BSD licence"]
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Definitions
*/
#include "antlr3defs.hpp"
ANTLR_BEGIN_NAMESPACE()
/** Sentinel value 0xFFFFFFFF.
* The documentation of Lexer::matchs() states that the strings passed to it
* are terminated with 0xFFFFFFFF, which is an invalid UTF32 character;
* presumably this constant names that terminator (confirm against the
* matchs() implementation).
*/
static const ANTLR_UINT32 ANTLR_STRING_TERMINATOR = 0xFFFFFFFF;
/** Lexer class template, parameterized on an ImplTraits policy bundle.
*
* The class derives publicly from ImplTraits::RecognizerType<InputStreamType>
* and from ImplTraits::TokenSourceType, and keeps a pointer to the character
* input stream it reads from.
*
* Only declarations appear here. NOTE(review): the member definitions are
* presumably provided by antlr3lexer.inl, which this header includes after
* the class; the exact behavior of members that carry no description below
* should be confirmed there.
*/
template<class ImplTraits>
class Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >,
public ImplTraits::TokenSourceType
{
public:
// Type aliases. Most are taken directly from ImplTraits; IntStreamType,
// RecognizerSharedStateType, RuleMemoType and DebuggerType are derived from
// the aliased stream/recognizer/state types.
typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
typedef typename ImplTraits::InputStreamType InputStreamType;
typedef InputStreamType StreamType;
typedef typename InputStreamType::IntStreamType IntStreamType;
typedef typename ImplTraits::CommonTokenType CommonTokenType;
// Note: TokenType is an alias of ImplTraits::StreamDataType, not of CommonTokenType.
typedef typename ImplTraits::StreamDataType TokenType;
typedef typename ImplTraits::StringType StringType;
typedef typename ImplTraits::StringStreamType StringStreamType;
// Same instantiation as the first base class of Lexer.
typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType;
typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType;
typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType;
typedef typename ImplTraits::BitsetListType BitsetListType;
// Same type as the second base class of Lexer.
typedef typename ImplTraits::TokenSourceType TokenSourceType;
typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType;
typedef typename RecognizerType::DebugEventListenerType DebuggerType;
private:
/** A pointer to the character stream from which this lexer is receiving
* characters.
* TODO: I may come back to this and implement charstream outside
* the input stream as per the java implementation.
*/
InputStreamType* m_input;
public:
/** Constructors. Both take a size hint and a pointer to the shared
* recognizer state; the second overload additionally takes the input stream.
* NOTE(review): how sizeHint is used, and whether a null state is accepted,
* is not visible in this header.
*/
Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state);
// Stream accessors. NOTE(review): presumably get_input() exposes m_input and
// get_istream() its IntStreamType view - confirm in the definitions.
InputStreamType* get_input() const;
IntStreamType* get_istream() const;
// Access to this lexer through its RecognizerType / TokenSourceType
// interfaces (const and non-const overloads are declared; the const
// get_tokSource() overload appears further down).
RecognizerType* get_rec();
const RecognizerType* get_rec() const;
TokenSourceType* get_tokSource();
// Functions used in the .stg file.
const RecognizerType* get_recognizer() const;
RecognizerSharedStateType* get_lexstate() const;
void set_lexstate( RecognizerSharedStateType* lexstate );
const TokenSourceType* get_tokSource() const;
CommonTokenType* get_ltoken() const;
void set_ltoken( const CommonTokenType* ltoken );
bool hasFailed() const;
ANTLR_INT32 get_backtracking() const;
void inc_backtracking();
void dec_backtracking();
bool get_failedflag() const;
void set_failedflag( bool failed );
InputStreamType* get_strstream() const;
ANTLR_MARKER index() const;
void seek(ANTLR_MARKER index);
const CommonTokenType* EOF_Token() const;
bool hasException() const;
ExceptionBaseType* get_exception() const;
void constructEx();
void lrecover();
ANTLR_MARKER mark();
void rewind(ANTLR_MARKER marker);
void rewindLast();
void setText( const StringType& text );
void skip();
RuleMemoType* getRuleMemo() const;
DebuggerType* get_debugger() const;
void setRuleMemo(RuleMemoType* rulememo);
ANTLR_UINT32 LA(ANTLR_INT32 i);
void consume();
void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart);
bool haveParsedRule(ANTLR_MARKER ruleIndex);
/** Sets the charstream source for the lexer and causes it to be reset.
*
* \param input
* Character input stream to install as the lexer's source.
*/
void setCharStream(InputStreamType* input);
/*!
* \brief
* Change to a new input stream, remembering the old one.
*
* \param input
* New input stream to install as the current one.
*
* Switches the current character input stream to
* a new one, saving the old one, which we will revert to at the end of this
* new one.
*/
void pushCharStream(InputStreamType* input);
/*!
* \brief
* Stops using the current input stream and reverts to any prior
* input stream on the stack.
*
* Abandons the current input stream, whether it is empty or not, and
* reverts to the previous stacked input stream.
*
* \remark
* The function fails silently if there are no prior input streams.
*/
void popCharStream();
/** Emits (a copy of) the supplied token as the next token in
* the stream.
*/
void emit(const CommonTokenType* token);
/** Constructs a new token from the information stored in the lexer.
*/
CommonTokenType* emit();
/** Attempts to match and consume the specified string from the input
* stream. Note that strings must be passed as terminated arrays of ANTLR_UCHAR. Strings are terminated
* with 0xFFFFFFFF, which is an invalid UTF32 character.
* NOTE(review): the bool result presumably reports whether the match
* succeeded - confirm in the definition.
*/
bool matchs(ANTLR_UCHAR* string);
/** Matches and consumes the specified character from the input stream.
* The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
* implementation is source encoding agnostic and so input streams do not generally need to
* override the default implementation.
*/
bool matchc(ANTLR_UCHAR c);
/** Matches any character in the supplied range (I suppose it could be a token range too
* but this would only be useful if the tokens were in some guaranteed order which is
* only going to happen with a hand crafted token set).
* NOTE(review): whether the bounds low/high are inclusive is not visible here.
*/
bool matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high);
/** Matches the next token/char in the input stream
* regardless of what it actually is.
*/
void matchAny();
/** Recovers from an error found in the input stream.
* Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
* be from a mismatched token that the match functions could not recover from.
*/
void recover();
/** Returns the current line number in the input stream.
*/
ANTLR_UINT32 getLine();
// NOTE(review): by their names these two report the current character index
// and the character position within the current line - confirm in the
// definitions.
ANTLR_MARKER getCharIndex();
ANTLR_UINT32 getCharPositionInLine();
/** Returns the text so far for the current token being generated.
*/
StringType getText();
// Other utility functions
void fillExceptionData( ExceptionBaseType* ex );
/** Default lexer error handler (works for 8 bit streams only!!!)
*/
void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex);
void exConstruct();
TokenType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
ANTLR_UINT32 expectedTokenType, BitsetListType* follow);
/** Destructor. Per the original runtime notes it is what frees the
* resources of a lexer; which resources those are is not visible in this
* header.
*/
~Lexer();
};
ANTLR_END_NAMESPACE()
#include "antlr3lexer.inl"
#endif