// runtime/Cpp/include/antlr3lexer.hpp - platform/external/antlr - Git at Google

 /** \file
  * Base interface for any ANTLR3 lexer.
  *
  * An ANTLR3 lexer is built from two sets of components:
  *
  *  - The runtime components that provide common functionality such as
  *    traversing character streams, building tokens for output and so on.
  *  - The generated rules and structure of the actual lexer, which call upon the
  *    runtime components.
  *
  * A lexer class contains a character input stream, a base recognizer interface
  * (which it will normally implement) and a token source interface (which it also
  * implements). The token source interface is called by a token consumer (such as
  * a parser, but in theory it can be anything that wants a set of abstract
  * tokens in place of a raw character stream).
  *
  * So then, we set up a lexer in a sequence akin to:
  *
  *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
  *    and initialize it.
  *  - Create a lexer interface and tell it where its input stream is.
  *    This will cause the creation of a base recognizer class, which it will
  *    override with its own implementations of some methods. The lexer creator
  *    can also then in turn override anything it likes.
  *  - The lexer token source interface is then passed to some interface that
  *    knows how to use it, by calling for a next token.
  *  - When a next token is called, let ze lexing begin.
  *
  */
 #ifndef	_ANTLR3_LEXER_HPP
 #define	_ANTLR3_LEXER_HPP

 // [The "BSD licence"]
 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB

 //
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
 // are met:
 // 1. Redistributions of source code must retain the above copyright
 //    notice, this list of conditions and the following disclaimer.
 // 2. Redistributions in binary form must reproduce the above copyright
 //    notice, this list of conditions and the following disclaimer in the
 //    documentation and/or other materials provided with the distribution.
 // 3. The name of the author may not be used to endorse or promote products
 //    derived from this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 /* Definitions
  */
 #include    "antlr3defs.hpp"

 ANTLR_BEGIN_NAMESPACE()

 /** Sentinel value 0xFFFFFFFF.
  *
  *  The documentation of Lexer::matchs() in this file describes its string
  *  argument as being terminated with 0xFFFFFFFF, which it notes is an invalid
  *  UTF32 character. Presumably this constant is that terminator -- confirm
  *  against the implementation in antlr3lexer.inl.
  */
 static const ANTLR_UINT32	ANTLR_STRING_TERMINATOR	= 0xFFFFFFFF;

 /** Lexer class template of the ANTLR3 C++ runtime.
  *
  *  The class derives publicly from ImplTraits::RecognizerType<InputStreamType>
  *  and from ImplTraits::TokenSourceType; every concrete type it works with is
  *  supplied through the ImplTraits template parameter.
  *
  *  Only declarations appear in this class body. The member definitions are
  *  presumably provided by antlr3lexer.inl, which this header includes after
  *  the class -- confirm there before relying on any behaviour described here.
  */
 template<class ImplTraits>
 class  Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >,
 			   public ImplTraits::TokenSourceType
 {
 public:
 	// Short aliases for the types supplied by ImplTraits.
 	typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
 	typedef typename ImplTraits::InputStreamType InputStreamType;
 	typedef InputStreamType StreamType;
 	typedef typename InputStreamType::IntStreamType IntStreamType;
 	typedef typename ImplTraits::CommonTokenType CommonTokenType;
 	typedef typename ImplTraits::StreamDataType TokenType;
 	typedef typename ImplTraits::StringType StringType;
 	typedef typename ImplTraits::StringStreamType StringStreamType;
 	// Same type as the first base class of Lexer.
 	typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType;
 	typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType;
 	typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType;
 	typedef typename ImplTraits::BitsetListType BitsetListType;
 	// Same type as the second base class of Lexer.
 	typedef typename ImplTraits::TokenSourceType TokenSourceType;

 	typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType;
 	typedef typename RecognizerType::DebugEventListenerType DebuggerType;

 private:
     /** Pointer to the character stream from which this lexer receives
      *  characters.
      *  TODO: I may come back to this and implement charstream outside
      *  the input stream as per the java implementation.
      */
     InputStreamType*		m_input;

 public:
 	/** Constructors. The second overload additionally receives the input
 	 *  stream. NOTE(review): the meaning of sizeHint and the ownership of
 	 *  the input and state pointers are not visible in this header --
 	 *  confirm in antlr3lexer.inl.
 	 */
 	Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
 	Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state);

 	// Accessors returning the input stream, its integer-stream view, the
 	// recognizer and the token source (as suggested by the return types;
 	// the definitions are not visible here).
 	InputStreamType* get_input() const;
 	IntStreamType* get_istream() const;
 	RecognizerType* get_rec();
 	const RecognizerType* get_rec() const;
 	TokenSourceType* get_tokSource();

 	// Functions used in the .stg file.
 	// NOTE(review): presumably these are the entry points called from
 	// generated lexer code; keep their names and signatures stable.
 	const RecognizerType* get_recognizer() const;
 	RecognizerSharedStateType* get_lexstate() const;
 	void set_lexstate( RecognizerSharedStateType* lexstate );
 	const TokenSourceType* get_tokSource() const;
 	CommonTokenType* get_ltoken() const;
 	void set_ltoken( const CommonTokenType* ltoken );
 	bool hasFailed() const;
 	ANTLR_INT32 get_backtracking() const;
 	void inc_backtracking();
 	void dec_backtracking();
 	bool get_failedflag() const;
 	void set_failedflag( bool failed );
 	InputStreamType* get_strstream() const;
 	ANTLR_MARKER  index() const;
 	void	seek(ANTLR_MARKER index);
 	const CommonTokenType* EOF_Token() const;
 	bool hasException() const;
 	ExceptionBaseType* get_exception() const;
 	void constructEx();
 	void lrecover();
 	ANTLR_MARKER mark();
 	void rewind(ANTLR_MARKER marker);
 	void rewindLast();
 	void setText( const StringType& text );
 	void skip();
 	RuleMemoType* getRuleMemo() const;
 	DebuggerType* get_debugger() const;
 	void setRuleMemo(RuleMemoType* rulememo);
 	ANTLR_UINT32 LA(ANTLR_INT32 i);
 	void consume();
 	void memoize(ANTLR_MARKER	ruleIndex, ANTLR_MARKER	ruleParseStart);
 	bool haveParsedRule(ANTLR_MARKER	ruleIndex);

     /** Sets the charstream source for the lexer and causes it to be reset.
      *
      *  \param input
      *  Character stream to install as the lexer's source.
      */
     void	setCharStream(InputStreamType* input);

     /*!
 	 * \brief
 	 * Change to a new input stream, remembering the old one.
 	 *
 	 * \param input
 	 * New input stream to install as the current one.
 	 *
 	 * Switches the current character input stream to
 	 * a new one, saving the old one, which we will revert to at the end of this
 	 * new one.
 	 */
     void	pushCharStream(InputStreamType* input);

 	/*!
 	 * \brief
 	 * Stops using the current input stream and reverts to any prior
 	 * input stream on the stack.
 	 *
 	 * Abandons the current input stream, whether it is empty or not, and
 	 * reverts to the previous stacked input stream.
 	 *
 	 * \remark
 	 * The function fails silently if there are no prior input streams.
 	 */
     void	popCharStream();

     /** Emits (a copy of) the supplied token as the next token in
      *  the stream.
      */
     void	emit(const CommonTokenType* token);

     /** Constructs a new token from the information stored in the lexer.
      */
     CommonTokenType*	emit();

     /** Attempts to match and consume the specified string from the input
      *  stream. Note that strings must be passed as terminated arrays of ANTLR_UCHAR. Strings are terminated
      *  with 0xFFFFFFFF, which is an invalid UTF32 character.
      *  NOTE(review): the bool result presumably reports whether the match
      *  succeeded -- confirm in antlr3lexer.inl.
      */
     bool	matchs(ANTLR_UCHAR* string);

     /** Matches and consumes the specified character from the input stream.
      *  The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
      *  implementation is source encoding agnostic and so input streams do not generally need to
      *  override the default implementation.
      */
     bool	matchc(ANTLR_UCHAR c);

     /** Matches any character in the supplied range (I suppose it could be a token range too
      *  but this would only be useful if the tokens were in some guaranteed order which is
      *  only going to happen with a hand crafted token set).
      */
     bool	matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high);

     /** Matches the next token/char in the input stream
      *  regardless of what it actually is.
      */
     void		matchAny();

     /** Recovers from an error found in the input stream.
      *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
      *  be from a mismatched token that the match functions could not recover from.
      */
     void		recover();

     /** Returns the current line number in the input stream.
      */
     ANTLR_UINT32	getLine();
     // NOTE(review): undocumented in the original; by their names these report
     // the current character index and the position within the current line.
     ANTLR_MARKER	getCharIndex();
     ANTLR_UINT32	getCharPositionInLine();

     /** Returns the text so far for the current token being generated.
      */
     StringType 	getText();

 	//Other utility functions
 	void fillExceptionData( ExceptionBaseType* ex );

 	/** Default lexer error handler (works for 8 bit streams only!!!)
 	 */
 	void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex);
 	void exConstruct();
 	TokenType*	getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
 								  ANTLR_UINT32	expectedTokenType, BitsetListType*	follow);

     /** Destructor; frees the resources of a lexer.
      */
 	~Lexer();
 };

 ANTLR_END_NAMESPACE()

 #include "antlr3lexer.inl"

 #endif
	/** \file
	* Base interface for any ANTLR3 lexer.
	*
	* An ANTLR3 lexer is built from two sets of components:
	*
	* - The runtime components that provide common functionality such as
	* traversing character streams, building tokens for output and so on.
	* - The generated rules and structure of the actual lexer, which call upon the
	* runtime components.
	*
	* A lexer class contains a character input stream, a base recognizer interface
	* (which it will normally implement) and a token source interface (which it also
	* implements). The token source interface is called by a token consumer (such as
	* a parser, but in theory it can be anything that wants a set of abstract
	* tokens in place of a raw character stream).
	*
	* So then, we set up a lexer in a sequence akin to:
	*
	* - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
	* and initialize it.
	* - Create a lexer interface and tell it where its input stream is.
	* This will cause the creation of a base recognizer class, which it will
	* override with its own implementations of some methods. The lexer creator
	* can also then in turn override anything it likes.
	* - The lexer token source interface is then passed to some interface that
	* knows how to use it, by calling for a next token.
	* - When a next token is called, let ze lexing begin.
	*
	*/
	#ifndef _ANTLR3_LEXER_HPP
	#define _ANTLR3_LEXER_HPP

	// [The "BSD licence"]
	// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB

	//
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions
	// are met:
	// 1. Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// 2. Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	// 3. The name of the author may not be used to endorse or promote products
	// derived from this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	/* Definitions
	*/
	#include "antlr3defs.hpp"

	ANTLR_BEGIN_NAMESPACE()

	/** Sentinel value 0xFFFFFFFF.
	*
	* The documentation of Lexer::matchs() in this file describes its string
	* argument as being terminated with 0xFFFFFFFF, which it notes is an invalid
	* UTF32 character. Presumably this constant is that terminator -- confirm
	* against the implementation in antlr3lexer.inl.
	*/
	static const ANTLR_UINT32 ANTLR_STRING_TERMINATOR = 0xFFFFFFFF;

	/** Lexer class template of the ANTLR3 C++ runtime.
	*
	* The class derives publicly from ImplTraits::RecognizerType<InputStreamType>
	* and from ImplTraits::TokenSourceType; every concrete type it works with is
	* supplied through the ImplTraits template parameter.
	*
	* Only declarations appear in this class body. The member definitions are
	* presumably provided by antlr3lexer.inl, which this header includes after
	* the class -- confirm there before relying on any behaviour described here.
	*
	* NOTE(review): this is a second copy of the header text. It sits after the
	* first copy's #endif and is wrapped in the same _ANTLR3_LEXER_HPP include
	* guard, so the preprocessor skips it once the first copy has been seen.
	*/
	template<class ImplTraits>
	class Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >,
	public ImplTraits::TokenSourceType
	{
	public:
	// Short aliases for the types supplied by ImplTraits.
	typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
	typedef typename ImplTraits::InputStreamType InputStreamType;
	typedef InputStreamType StreamType;
	typedef typename InputStreamType::IntStreamType IntStreamType;
	typedef typename ImplTraits::CommonTokenType CommonTokenType;
	typedef typename ImplTraits::StreamDataType TokenType;
	typedef typename ImplTraits::StringType StringType;
	typedef typename ImplTraits::StringStreamType StringStreamType;
	// Same type as the first base class of Lexer.
	typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType;
	typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType;
	typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType;
	typedef typename ImplTraits::BitsetListType BitsetListType;
	// Same type as the second base class of Lexer.
	typedef typename ImplTraits::TokenSourceType TokenSourceType;

	typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType;
	typedef typename RecognizerType::DebugEventListenerType DebuggerType;

	private:
	/** Pointer to the character stream from which this lexer receives
	* characters.
	* TODO: I may come back to this and implement charstream outside
	* the input stream as per the java implementation.
	*/
	InputStreamType* m_input;

	public:
	/** Constructors. The second overload additionally receives the input
	* stream. NOTE(review): the meaning of sizeHint and the ownership of
	* the input and state pointers are not visible in this header --
	* confirm in antlr3lexer.inl.
	*/
	Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
	Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state);

	// Accessors returning the input stream, its integer-stream view, the
	// recognizer and the token source (as suggested by the return types;
	// the definitions are not visible here).
	InputStreamType* get_input() const;
	IntStreamType* get_istream() const;
	RecognizerType* get_rec();
	const RecognizerType* get_rec() const;
	TokenSourceType* get_tokSource();

	// Functions used in the .stg file.
	// NOTE(review): presumably these are the entry points called from
	// generated lexer code; keep their names and signatures stable.
	const RecognizerType* get_recognizer() const;
	RecognizerSharedStateType* get_lexstate() const;
	void set_lexstate( RecognizerSharedStateType* lexstate );
	const TokenSourceType* get_tokSource() const;
	CommonTokenType* get_ltoken() const;
	void set_ltoken( const CommonTokenType* ltoken );
	bool hasFailed() const;
	ANTLR_INT32 get_backtracking() const;
	void inc_backtracking();
	void dec_backtracking();
	bool get_failedflag() const;
	void set_failedflag( bool failed );
	InputStreamType* get_strstream() const;
	ANTLR_MARKER index() const;
	void seek(ANTLR_MARKER index);
	const CommonTokenType* EOF_Token() const;
	bool hasException() const;
	ExceptionBaseType* get_exception() const;
	void constructEx();
	void lrecover();
	ANTLR_MARKER mark();
	void rewind(ANTLR_MARKER marker);
	void rewindLast();
	void setText( const StringType& text );
	void skip();
	RuleMemoType* getRuleMemo() const;
	DebuggerType* get_debugger() const;
	void setRuleMemo(RuleMemoType* rulememo);
	ANTLR_UINT32 LA(ANTLR_INT32 i);
	void consume();
	void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart);
	bool haveParsedRule(ANTLR_MARKER ruleIndex);

	/** Sets the charstream source for the lexer and causes it to be reset.
	*
	* \param input
	* Character stream to install as the lexer's source.
	*/
	void setCharStream(InputStreamType* input);

	/*!
	* \brief
	* Change to a new input stream, remembering the old one.
	*
	* \param input
	* New input stream to install as the current one.
	*
	* Switches the current character input stream to
	* a new one, saving the old one, which we will revert to at the end of this
	* new one.
	*/
	void pushCharStream(InputStreamType* input);

	/*!
	* \brief
	* Stops using the current input stream and reverts to any prior
	* input stream on the stack.
	*
	* Abandons the current input stream, whether it is empty or not, and
	* reverts to the previous stacked input stream.
	*
	* \remark
	* The function fails silently if there are no prior input streams.
	*/
	void popCharStream();

	/** Emits (a copy of) the supplied token as the next token in
	* the stream.
	*/
	void emit(const CommonTokenType* token);

	/** Constructs a new token from the information stored in the lexer.
	*/
	CommonTokenType* emit();

	/** Attempts to match and consume the specified string from the input
	* stream. Note that strings must be passed as terminated arrays of ANTLR_UCHAR. Strings are terminated
	* with 0xFFFFFFFF, which is an invalid UTF32 character.
	* NOTE(review): the bool result presumably reports whether the match
	* succeeded -- confirm in antlr3lexer.inl.
	*/
	bool matchs(ANTLR_UCHAR* string);

	/** Matches and consumes the specified character from the input stream.
	* The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
	* implementation is source encoding agnostic and so input streams do not generally need to
	* override the default implementation.
	*/
	bool matchc(ANTLR_UCHAR c);

	/** Matches any character in the supplied range (I suppose it could be a token range too
	* but this would only be useful if the tokens were in some guaranteed order which is
	* only going to happen with a hand crafted token set).
	*/
	bool matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high);

	/** Matches the next token/char in the input stream
	* regardless of what it actually is.
	*/
	void matchAny();

	/** Recovers from an error found in the input stream.
	* Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
	* be from a mismatched token that the match functions could not recover from.
	*/
	void recover();

	/** Returns the current line number in the input stream.
	*/
	ANTLR_UINT32 getLine();
	// NOTE(review): undocumented in the original; by their names these report
	// the current character index and the position within the current line.
	ANTLR_MARKER getCharIndex();
	ANTLR_UINT32 getCharPositionInLine();

	/** Returns the text so far for the current token being generated.
	*/
	StringType getText();

	//Other utility functions
	void fillExceptionData( ExceptionBaseType* ex );

	/** Default lexer error handler (works for 8 bit streams only!!!)
	*/
	void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex);
	void exConstruct();
	TokenType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
	ANTLR_UINT32 expectedTokenType, BitsetListType* follow);

	/** Destructor; frees the resources of a lexer.
	*/
	~Lexer();
	};

	ANTLR_END_NAMESPACE()

	#include "antlr3lexer.inl"

	#endif