// antlr-3.4/runtime/JavaScript/tests/functional/rhino-python.extensions - platform/external/antlr - Git at Google

 /*
  [The "BSD licence"]
  Copyright (c) 2004 Terence Parr and Loring Craymer
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  1. Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
  3. The name of the author may not be used to endorse or promote products
     derived from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


 /** Python does not explicitly provide begin and end nesting signals.
  Rather, the indentation level indicates when you begin and end.
  This is an interesting lexical problem because multiple DEDENT
  tokens should be sent to the parser sometimes without a corresponding
  input symbol!  Consider the following example:

  a=1
  if a>1:
      print a
  b=3

  Here the "b" token on the left edge signals that a DEDENT is needed
  after the "print a \n" and before the "b".  The sequence should be

  ... 1 COLON NEWLINE INDENT PRINT a NEWLINE DEDENT b ASSIGN 3 ...

  For more examples, see the big comment at the bottom of this file.

  This TokenStream normally just passes tokens through to the parser.
  Upon NEWLINE token from the lexer, however, an INDENT or DEDENT token
  may need to be sent to the parser.  The NEWLINE is the trigger for
  this class to do its job.  NEWLINE is saved and then the first token
  of the next line is examined.  If non-leading-whitespace token,
  then check against stack for indent vs dedent.  If LEADING_WS, then
  the column of the next non-whitespace token will dictate indent vs
  dedent.  The column of the next real token is number of spaces
  in the LEADING_WS token + 1 (to move past the whitespace).  The
  lexer grammar must set the text of the LEADING_WS token to be
  the proper number of spaces (and do tab conversion etc...).

  A stack of column numbers is tracked and used to detect changes
  in indent level from one token to the next.

  A queue of tokens is built up to hold multiple DEDENT tokens that
  are generated.  Before asking the lexer for another token via
  nextToken(), the queue is flushed first one token at a time.

  Terence Parr and Loring Craymer
  February 2004
  */
 PythonTokenSource = function(stream) {
 	// The token stream being wrapped; the prototype methods read it
 	// through LT(1), consume(), getTokens() and getSourceName().
 	this.stream = stream;

 	// Queue of tokens waiting to be returned by nextToken().
 	this.tokens = [];
 	// Token index of the last stream token placed on the queue (-1: none yet).
 	this.lastTokenAddedIndex = -1;

 	// Indent stack: column numbers of the currently open indent levels.
 	// `sp` indexes the top entry and grows upwards; -1 means empty.
 	this.indentStack = new Array(PythonTokenSource.MAX_INDENTS);
 	this.sp = -1;

 	// Seed the stack with the left-edge column. This must stay last:
 	// push() relies on indentStack and sp being initialised.
 	this.push(PythonTokenSource.FIRST_CHAR_POSITION);
 };

 // Constants attached to the PythonTokenSource constructor itself.
 ANTLR.lang.augmentObject(PythonTokenSource, {
 	// Length used when allocating the indent stack; also the limit push() checks.
 	MAX_INDENTS: 100,
 	// Column pushed as the bottom entry of the indent stack by the constructor.
 	FIRST_CHAR_POSITION: 0
 });

 PythonTokenSource.prototype = {
 	/** Return the source name reported by the wrapped stream. */
 	getSourceName: function() {
 		return this.stream.getSourceName();
 	},

 	/** Return the next token for the parser, inserting imaginary INDENT
 	 and DEDENT tokens where the indentation changes.

 	 From http://www.python.org/doc/2.2.3/ref/indentation.html

 	 "Before the first line of the file is read, a single zero is
 	 pushed on the stack; this will never be popped off again. The
 	 numbers pushed on the stack will always be strictly increasing
 	 from bottom to top. At the beginning of each logical line, the
 	 line's indentation level is compared to the top of the
 	 stack. If it is equal, nothing happens. If it is larger, it is
 	 pushed on the stack, and one INDENT token is generated. If it
 	 is smaller, it must be one of the numbers occurring on the
 	 stack; all numbers on the stack that are larger are popped
 	 off, and for each number popped off a DEDENT token is
 	 generated. At the end of the file, a DEDENT token is generated
 	 for each number remaining on the stack that is larger than
 	 zero."

 	 I use char position in line 0..n-1 instead.

 	 The DEDENTs possibly needed at EOF are handled by treating an EOF
 	 that follows a NEWLINE as if it sat at column -1, i.e. to the left
 	 of every indent level on the stack.
 	 */
 	nextToken: function() {
 		// Every call to insertImaginaryIndentDedentTokens() queues at least
 		// one token, so this loop runs at most once per call.
 		while ( this.tokens.length === 0 ) {
 			this.insertImaginaryIndentDedentTokens();
 		}
 		return this.tokens.shift();
 	},

 	/** Queue whatever the stream holds between the last token that was
 	 handed on and `t` (the tokens LT(1)/consume() stepped over), then
 	 record `t` as the last token handed on.  getTokens() may return
 	 null, in which case nothing is queued.
 	 */
 	enqueueHiddenTokensBefore: function(t) {
 		var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1, t.getTokenIndex()-1);
 		if ( hiddenTokens != null ) {
 			this.tokens = this.tokens.concat(hiddenTokens);
 		}
 		this.lastTokenAddedIndex = t.getTokenIndex();
 	},

 	/** Build an empty-text token of the given type that borrows its line
 	 and char position from the real token `t`.
 	 */
 	imaginaryTokenAt: function(type, t) {
 		var imaginary = new ANTLR.runtime.CommonToken(type, "");
 		imaginary.setCharPositionInLine(t.getCharPositionInLine());
 		imaginary.setLine(t.getLine());
 		return imaginary;
 	},

 	/** Pull one token from the stream onto the queue.  If it is a NEWLINE,
 	 also pull the first token of the next line and queue INDENT/DEDENT
 	 tokens according to how its column compares with the indent stack.
 	 */
 	insertImaginaryIndentDedentTokens: function()
 	{
 		var t = this.stream.LT(1);
 		this.stream.consume();
 		this.enqueueHiddenTokensBefore(t);
 		this.tokens.push(t);

 		// if not a NEWLINE, doesn't signal indent/dedent work; just enqueue
 		if ( t.getType() !== PythonLexer.NEWLINE ) {
 			return;
 		}

 		// NEWLINE is queued; grab first token of next line
 		t = this.stream.LT(1);
 		this.stream.consume();
 		this.enqueueHiddenTokensBefore(t);

 		// compute cpos as the char pos of next non-WS token in line
 		var cpos = t.getCharPositionInLine(); // column dictates indent/dedent
 		if ( t.getType() === ANTLR.runtime.Token.EOF ) {
 			cpos = -1; // left of every indent level, so EOF closes all open ones
 		}
 		else if ( t.getType() === PythonLexer.LEADING_WS ) {
 			// the whitespace text length is the 0-based column of what follows
 			cpos = t.getText().length;
 		}

 		// compare to last indent level
 		var lastIndent = this.peek();
 		if ( cpos > lastIndent ) { // they indented; track and gen INDENT
 			this.push(cpos);
 			this.tokens.push(this.imaginaryTokenAt(PythonParser.INDENT, t));
 		}
 		else if ( cpos < lastIndent ) { // they dedented
 			// how far back did we dedent?
 			var prevIndex = this.findPreviousIndent(cpos);
 			// generate one DEDENT for each indent level we backed up over
 			for (var d=this.sp; d>prevIndex; d--) {
 				this.tokens.push(this.imaginaryTokenAt(PythonParser.DEDENT, t));
 			}
 			this.sp = prevIndex; // pop those off indent level
 		}
 		if ( t.getType() !== PythonLexer.LEADING_WS ) { // discard WS
 			this.tokens.push(t);
 		}
 	},

 	//  T O K E N  S T A C K  M E T H O D S

 	/** Push column `i` as a new indent level.
 	 Throws Error("stack overflow") when the stack already holds
 	 MAX_INDENTS entries, the capacity the constructor allocates.
 	 */
 	push: function(i) {
 		if (this.sp >= PythonTokenSource.MAX_INDENTS-1) {
 			throw new Error("stack overflow");
 		}
 		this.sp++;
 		this.indentStack[this.sp] = i;
 	},

 	/** Remove and return the top indent level.
 	 Throws Error("stack underflow") when the stack is empty.
 	 */
 	pop: function() {
 		if (this.sp<0) {
 			throw new Error("stack underflow");
 		}
 		var top = this.indentStack[this.sp];
 		this.sp--;
 		return top;
 	},

 	/** Return the top indent level without removing it. */
 	peek: function() {
 		return this.indentStack[this.sp];
 	},

 	/** Return the stack index, searching below the top entry, of the
 	 indent level equal to `i`.  When no entry matches, return 0 (the
 	 bottom of the stack), so an unmatched dedent closes every open
 	 level rather than raising an error.
 	 */
 	findPreviousIndent: function(i) {
 		for (var j=this.sp-1; j>=0; j--) {
 			if ( this.indentStack[j] === i ) {
 				return j;
 			}
 		}
 		return 0;
 	},

 	/** Return the indent stack as a space-separated string, top entry first. */
 	stackString: function() {
 		var buf = [];
 		for (var j=this.sp; j>=0; j--) {
 			buf.push(this.indentStack[j]);
 		}
 		return buf.join(" ");
 	}

 };

 /* More example input / output pairs with code simplified to single chars
 ------- t1 -------
 a a
         b b
         c
 d
 a a \n INDENT b b \n c \n DEDENT d \n EOF
 ------- t2 -------
 a  c
  b
 c
 a c \n INDENT b \n DEDENT c \n EOF
 ------- t3 -------
 a
         b
                 c
 d
 a \n INDENT b \n INDENT c \n DEDENT DEDENT d \n EOF
 ------- t4 -------
 a
     c
                   d
     e
     f
              g
              h
              i
               j
     k
 a \n INDENT c \n INDENT d \n DEDENT e \n f \n INDENT g \n h \n i \n INDENT j \n DEDENT DEDENT k \n DEDENT EOF
 ------- t5 -------
 a
         b
         c
                 d
                 e
 a \n INDENT b \n c \n INDENT d \n e \n DEDENT DEDENT EOF
 */
	/*
	[The "BSD licence"]
	Copyright (c) 2004 Terence Parr and Loring Craymer
	All rights reserved.

	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions
	are met:
	1. Redistributions of source code must retain the above copyright
	notice, this list of conditions and the following disclaimer.
	2. Redistributions in binary form must reproduce the above copyright
	notice, this list of conditions and the following disclaimer in the
	documentation and/or other materials provided with the distribution.
	3. The name of the author may not be used to endorse or promote products
	derived from this software without specific prior written permission.

	THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/


	/** Python does not explicitly provide begin and end nesting signals.
	Rather, the indentation level indicates when you begin and end.
	This is an interesting lexical problem because multiple DEDENT
	tokens should be sent to the parser sometimes without a corresponding
	input symbol! Consider the following example:

	a=1
	if a>1:
	    print a
	b=3

	Here the "b" token on the left edge signals that a DEDENT is needed
	after the "print a \n" and before the "b". The sequence should be

	... 1 COLON NEWLINE INDENT PRINT a NEWLINE DEDENT b ASSIGN 3 ...

	For more examples, see the big comment at the bottom of this file.

	This TokenStream normally just passes tokens through to the parser.
	Upon NEWLINE token from the lexer, however, an INDENT or DEDENT token
	may need to be sent to the parser. The NEWLINE is the trigger for
	this class to do its job. NEWLINE is saved and then the first token
	of the next line is examined. If non-leading-whitespace token,
	then check against stack for indent vs dedent. If LEADING_WS, then
	the column of the next non-whitespace token will dictate indent vs
	dedent. The column of the next real token is number of spaces
	in the LEADING_WS token + 1 (to move past the whitespace). The
	lexer grammar must set the text of the LEADING_WS token to be
	the proper number of spaces (and do tab conversion etc...).

	A stack of column numbers is tracked and used to detect changes
	in indent level from one token to the next.

	A queue of tokens is built up to hold multiple DEDENT tokens that
	are generated. Before asking the lexer for another token via
	nextToken(), the queue is flushed first one token at a time.

	Terence Parr and Loring Craymer
	February 2004
	*/
	PythonTokenSource = function(stream) {
		// The token stream being wrapped; the prototype methods read it
		// through LT(1), consume(), getTokens() and getSourceName().
		this.stream = stream;

		// Queue of tokens waiting to be returned by nextToken().
		this.tokens = [];
		// Token index of the last stream token placed on the queue (-1: none yet).
		this.lastTokenAddedIndex = -1;

		// Indent stack: column numbers of the currently open indent levels.
		// `sp` indexes the top entry and grows upwards; -1 means empty.
		this.indentStack = new Array(PythonTokenSource.MAX_INDENTS);
		this.sp = -1;

		// Seed the stack with the left-edge column. This must stay last:
		// push() relies on indentStack and sp being initialised.
		this.push(PythonTokenSource.FIRST_CHAR_POSITION);
	};

	// Constants attached to the PythonTokenSource constructor itself.
	ANTLR.lang.augmentObject(PythonTokenSource, {
		// Length used when allocating the indent stack; also the limit push() checks.
		MAX_INDENTS: 100,
		// Column pushed as the bottom entry of the indent stack by the constructor.
		FIRST_CHAR_POSITION: 0
	});

	PythonTokenSource.prototype = {
		/** Return the source name reported by the wrapped stream. */
		getSourceName: function() {
			return this.stream.getSourceName();
		},

		/** Return the next token for the parser, inserting imaginary INDENT
		 and DEDENT tokens where the indentation changes.

		 From http://www.python.org/doc/2.2.3/ref/indentation.html

		 "Before the first line of the file is read, a single zero is
		 pushed on the stack; this will never be popped off again. The
		 numbers pushed on the stack will always be strictly increasing
		 from bottom to top. At the beginning of each logical line, the
		 line's indentation level is compared to the top of the
		 stack. If it is equal, nothing happens. If it is larger, it is
		 pushed on the stack, and one INDENT token is generated. If it
		 is smaller, it must be one of the numbers occurring on the
		 stack; all numbers on the stack that are larger are popped
		 off, and for each number popped off a DEDENT token is
		 generated. At the end of the file, a DEDENT token is generated
		 for each number remaining on the stack that is larger than
		 zero."

		 I use char position in line 0..n-1 instead.

		 The DEDENTs possibly needed at EOF are handled by treating an EOF
		 that follows a NEWLINE as if it sat at column -1, i.e. to the left
		 of every indent level on the stack.
		 */
		nextToken: function() {
			// Every call to insertImaginaryIndentDedentTokens() queues at least
			// one token, so this loop runs at most once per call.
			while ( this.tokens.length === 0 ) {
				this.insertImaginaryIndentDedentTokens();
			}
			return this.tokens.shift();
		},

		/** Queue whatever the stream holds between the last token that was
		 handed on and `t` (the tokens LT(1)/consume() stepped over), then
		 record `t` as the last token handed on.  getTokens() may return
		 null, in which case nothing is queued.
		 */
		enqueueHiddenTokensBefore: function(t) {
			var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1, t.getTokenIndex()-1);
			if ( hiddenTokens != null ) {
				this.tokens = this.tokens.concat(hiddenTokens);
			}
			this.lastTokenAddedIndex = t.getTokenIndex();
		},

		/** Build an empty-text token of the given type that borrows its line
		 and char position from the real token `t`.
		 */
		imaginaryTokenAt: function(type, t) {
			var imaginary = new ANTLR.runtime.CommonToken(type, "");
			imaginary.setCharPositionInLine(t.getCharPositionInLine());
			imaginary.setLine(t.getLine());
			return imaginary;
		},

		/** Pull one token from the stream onto the queue.  If it is a NEWLINE,
		 also pull the first token of the next line and queue INDENT/DEDENT
		 tokens according to how its column compares with the indent stack.
		 */
		insertImaginaryIndentDedentTokens: function()
		{
			var t = this.stream.LT(1);
			this.stream.consume();
			this.enqueueHiddenTokensBefore(t);
			this.tokens.push(t);

			// if not a NEWLINE, doesn't signal indent/dedent work; just enqueue
			if ( t.getType() !== PythonLexer.NEWLINE ) {
				return;
			}

			// NEWLINE is queued; grab first token of next line
			t = this.stream.LT(1);
			this.stream.consume();
			this.enqueueHiddenTokensBefore(t);

			// compute cpos as the char pos of next non-WS token in line
			var cpos = t.getCharPositionInLine(); // column dictates indent/dedent
			if ( t.getType() === ANTLR.runtime.Token.EOF ) {
				cpos = -1; // left of every indent level, so EOF closes all open ones
			}
			else if ( t.getType() === PythonLexer.LEADING_WS ) {
				// the whitespace text length is the 0-based column of what follows
				cpos = t.getText().length;
			}

			// compare to last indent level
			var lastIndent = this.peek();
			if ( cpos > lastIndent ) { // they indented; track and gen INDENT
				this.push(cpos);
				this.tokens.push(this.imaginaryTokenAt(PythonParser.INDENT, t));
			}
			else if ( cpos < lastIndent ) { // they dedented
				// how far back did we dedent?
				var prevIndex = this.findPreviousIndent(cpos);
				// generate one DEDENT for each indent level we backed up over
				for (var d=this.sp; d>prevIndex; d--) {
					this.tokens.push(this.imaginaryTokenAt(PythonParser.DEDENT, t));
				}
				this.sp = prevIndex; // pop those off indent level
			}
			if ( t.getType() !== PythonLexer.LEADING_WS ) { // discard WS
				this.tokens.push(t);
			}
		},

		//  T O K E N  S T A C K  M E T H O D S

		/** Push column `i` as a new indent level.
		 Throws Error("stack overflow") when the stack already holds
		 MAX_INDENTS entries, the capacity the constructor allocates.
		 */
		push: function(i) {
			if (this.sp >= PythonTokenSource.MAX_INDENTS-1) {
				throw new Error("stack overflow");
			}
			this.sp++;
			this.indentStack[this.sp] = i;
		},

		/** Remove and return the top indent level.
		 Throws Error("stack underflow") when the stack is empty.
		 */
		pop: function() {
			if (this.sp<0) {
				throw new Error("stack underflow");
			}
			var top = this.indentStack[this.sp];
			this.sp--;
			return top;
		},

		/** Return the top indent level without removing it. */
		peek: function() {
			return this.indentStack[this.sp];
		},

		/** Return the stack index, searching below the top entry, of the
		 indent level equal to `i`.  When no entry matches, return 0 (the
		 bottom of the stack), so an unmatched dedent closes every open
		 level rather than raising an error.
		 */
		findPreviousIndent: function(i) {
			for (var j=this.sp-1; j>=0; j--) {
				if ( this.indentStack[j] === i ) {
					return j;
				}
			}
			return 0;
		},

		/** Return the indent stack as a space-separated string, top entry first. */
		stackString: function() {
			var buf = [];
			for (var j=this.sp; j>=0; j--) {
				buf.push(this.indentStack[j]);
			}
			return buf.join(" ");
		}

	};

	/* More example input / output pairs with code simplified to single chars
	------- t1 -------
	a a
	        b b
	        c
	d
	a a \n INDENT b b \n c \n DEDENT d \n EOF
	------- t2 -------
	a  c
	 b
	c
	a c \n INDENT b \n DEDENT c \n EOF
	------- t3 -------
	a
	        b
	                c
	d
	a \n INDENT b \n INDENT c \n DEDENT DEDENT d \n EOF
	------- t4 -------
	a
	    c
	                  d
	    e
	    f
	             g
	             h
	             i
	              j
	    k
	a \n INDENT c \n INDENT d \n DEDENT e \n f \n INDENT g \n h \n i \n INDENT j \n DEDENT DEDENT k \n DEDENT EOF
	------- t5 -------
	a
	        b
	        c
	                d
	                e
	a \n INDENT b \n c \n INDENT d \n e \n DEDENT DEDENT EOF
	*/