blob: bb4165e9d2637b4f990f3be196636750b6b16c75 [file] [log] [blame]
// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// A lexical scanner for Starlark.
import (
"fmt"
"io"
"io/ioutil"
"log"
"math/big"
"os"
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// A Token represents a Starlark lexical token.
type Token int8

const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // += (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	maxToken
)

// String returns the name of the token, or its literal spelling
// for punctuation tokens.
func (tok Token) String() string { return tokenNames[tok] }

// GoString is like String but quotes punctuation tokens.
// Use Sprintf("%#v", tok) when constructing error messages.
func (tok Token) GoString() string {
	if tok >= PLUS && tok <= STARSTAR {
		return "'" + tokenNames[tok] + "'"
	}
	return tokenNames[tok]
}

// tokenNames maps each Token to its display name.
// Every token up to WHILE must have an entry.
var tokenNames = [...]string{
	ILLEGAL:       "illegal token",
	EOF:           "end of file",
	NEWLINE:       "newline",
	INDENT:        "indent",
	OUTDENT:       "outdent",
	IDENT:         "identifier",
	INT:           "int literal",
	FLOAT:         "float literal",
	STRING:        "string literal",
	BYTES:         "bytes literal", // entry was missing; BYTES.String() used to return ""
	PLUS:          "+",
	MINUS:         "-",
	STAR:          "*",
	SLASH:         "/",
	SLASHSLASH:    "//",
	PERCENT:       "%",
	AMP:           "&",
	PIPE:          "|",
	CIRCUMFLEX:    "^",
	LTLT:          "<<",
	GTGT:          ">>",
	TILDE:         "~",
	DOT:           ".",
	COMMA:         ",",
	EQ:            "=",
	SEMI:          ";",
	COLON:         ":",
	LPAREN:        "(",
	RPAREN:        ")",
	LBRACK:        "[",
	RBRACK:        "]",
	LBRACE:        "{",
	RBRACE:        "}",
	LT:            "<",
	GT:            ">",
	GE:            ">=",
	LE:            "<=",
	EQL:           "==",
	NEQ:           "!=",
	PLUS_EQ:       "+=",
	MINUS_EQ:      "-=",
	STAR_EQ:       "*=",
	SLASH_EQ:      "/=",
	SLASHSLASH_EQ: "//=",
	PERCENT_EQ:    "%=",
	AMP_EQ:        "&=",
	PIPE_EQ:       "|=",
	CIRCUMFLEX_EQ: "^=",
	LTLT_EQ:       "<<=",
	GTGT_EQ:       ">>=",
	STARSTAR:      "**",
	AND:           "and",
	BREAK:         "break",
	CONTINUE:      "continue",
	DEF:           "def",
	ELIF:          "elif",
	ELSE:          "else",
	FOR:           "for",
	IF:            "if",
	IN:            "in",
	LAMBDA:        "lambda",
	LOAD:          "load",
	NOT:           "not",
	NOT_IN:        "not in",
	OR:            "or",
	PASS:          "pass",
	RETURN:        "return",
	WHILE:         "while",
}
// A FilePortion describes the content of a portion of a file.
// Callers may provide a FilePortion for the src argument of Parse
// when the desired initial line and column numbers are not (1, 1),
// such as when an expression is parsed from within larger file.
type FilePortion struct {
	Content             []byte // the text of the file portion
	FirstLine, FirstCol int32  // 1-based position at which Content begins within the whole file
}
// A Position describes the location of a rune of input.
type Position struct {
	file *string // filename (indirect for compactness)
	Line int32   // 1-based line number; 0 if line unknown
	Col  int32   // 1-based column (rune) number; 0 if column unknown
}

// IsValid reports whether the position is valid.
func (p Position) IsValid() bool { return p.file != nil }

// Filename returns the name of the file containing this position.
func (p Position) Filename() string {
	if p.file == nil {
		return "<invalid>"
	}
	return *p.file
}

// MakePosition returns position with the specified components.
func MakePosition(file *string, line, col int32) Position {
	return Position{file: file, Line: line, Col: col}
}

// add returns the position at the end of s, assuming it starts at p.
func (p Position) add(s string) Position {
	// If s spans lines, advance to the start of its final line
	// before counting the remaining runes as columns.
	if last := strings.LastIndexByte(s, '\n'); last >= 0 {
		p.Line += int32(strings.Count(s, "\n"))
		p.Col = 1
		s = s[last+1:]
	}
	p.Col += int32(utf8.RuneCountInString(s))
	return p
}

// String renders the position as "file:line:col", omitting
// unknown (zero) components from the right.
func (p Position) String() string {
	name := p.Filename()
	switch {
	case p.Line > 0 && p.Col > 0:
		return fmt.Sprintf("%s:%d:%d", name, p.Line, p.Col)
	case p.Line > 0:
		return fmt.Sprintf("%s:%d", name, p.Line)
	}
	return name
}

// isBefore reports whether p strictly precedes q in (line, col) order.
func (p Position) isBefore(q Position) bool {
	if p.Line == q.Line {
		return p.Col < q.Col
	}
	return p.Line < q.Line
}
// A scanner represents a single input file being parsed.
//
// In REPL mode (readline != nil) the input arrives one line at a
// time; otherwise rest holds the entire remaining input.
type scanner struct {
	rest           []byte    // rest of input (in REPL, a line of input)
	token          []byte    // token being scanned; suffix of input starting at current token
	pos            Position  // current input position
	depth          int       // nesting of [ ] { } ( ); newlines are ignored while depth > 0
	indentstk      []int     // stack of indentation levels (always starts with 0)
	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
	keepComments   bool      // accumulate comments in slice
	lineComments   []Comment // list of full line comments (if keepComments)
	suffixComments []Comment // list of suffix comments (if keepComments)

	readline func() ([]byte, error) // read next line of input (REPL only)
}
// newScanner returns a scanner for the input named filename.
// src determines how the input is obtained (see readSource);
// a src of type func() ([]byte, error) selects REPL mode, in which
// input is read a line at a time. If src is a FilePortion, scanning
// starts at the portion's initial line and column.
func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
	line, col := int32(1), int32(1)
	if fp, ok := src.(FilePortion); ok {
		line, col = fp.FirstLine, fp.FirstCol
	}
	sc := &scanner{
		pos:          MakePosition(&filename, line, col),
		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
		lineStart:    true,
		keepComments: keepComments,
	}

	sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
	if sc.readline != nil {
		return sc, nil
	}

	// Non-REPL: load the whole input up front.
	data, err := readSource(filename, src)
	if err != nil {
		return nil, err
	}
	sc.rest = data
	return sc, nil
}
// readSource returns the content of the input specified by src,
// which may be a string, []byte, io.Reader, or FilePortion.
// If src is nil, the file named by filename is read instead.
func readSource(filename string, src interface{}) ([]byte, error) {
	switch v := src.(type) {
	case nil:
		return ioutil.ReadFile(filename)
	case string:
		return []byte(v), nil
	case []byte:
		return v, nil
	case FilePortion:
		return v.Content, nil
	case io.Reader:
		data, err := ioutil.ReadAll(v)
		if err != nil {
			return nil, &os.PathError{Op: "read", Path: filename, Err: err}
		}
		return data, nil
	default:
		return nil, fmt.Errorf("invalid source: %T", v)
	}
}
// An Error describes the nature and position of a scanner or parser error.
type Error struct {
	Pos Position // position at which the error was detected
	Msg string   // error message, without position prefix
}

// Error implements the error interface, rendering as "pos: msg".
func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
// errorf is called to report an error.
// errorf does not return: it panics.
// The Error panic value is recovered by (*scanner).recover,
// which converts it into an ordinary error return.
func (sc *scanner) error(pos Position, s string) {
	panic(Error{pos, s})
}

// errorf is like error but formats its message in the manner of fmt.Sprintf.
func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
	sc.error(pos, fmt.Sprintf(format, args...))
}
// recover converts a panic during scanning or parsing into an
// ordinary error return assigned to *err.
// It must be invoked directly from a deferred call.
func (sc *scanner) recover(err *error) {
	// The scanner and parser panic both for routine errors like
	// syntax errors and for programmer bugs like array index
	// errors. Turn both into error returns. Catching bug panics
	// is especially important when processing many files.
	switch e := recover().(type) {
	case nil:
		// no panic
	case Error:
		// Deliberate error reported via sc.error/sc.errorf.
		*err = e
	default:
		// Unexpected panic: report it as an internal error at
		// the current scan position.
		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
		if debug {
			log.Fatal(*err)
		}
	}
}
// eof reports whether the input has reached end of file.
// In REPL mode it first attempts to fetch another line.
func (sc *scanner) eof() bool {
	if len(sc.rest) > 0 {
		return false
	}
	return !sc.readLine()
}
// readLine attempts to read another line of input.
// Precondition: len(sc.rest)==0.
// It reports whether any new input became available;
// outside REPL mode it always reports false.
func (sc *scanner) readLine() bool {
	if sc.readline == nil {
		return false
	}
	line, err := sc.readline()
	sc.rest = line
	if err != nil {
		sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
	}
	return len(sc.rest) > 0
}
// peekRune returns the next rune in the input without consuming it.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
// At end of file it returns 0.
func (sc *scanner) peekRune() rune {
	// TODO(adonovan): opt: measure and perhaps inline eof.
	if sc.eof() {
		return 0
	}

	b := sc.rest[0]
	if b >= utf8.RuneSelf {
		// Multibyte UTF-8 sequence.
		r, _ := utf8.DecodeRune(sc.rest)
		return r
	}

	// ASCII fast path; a carriage return reads as '\n'.
	if b == '\r' {
		return '\n'
	}
	return rune(b)
}
// readRune consumes and returns the next rune in the input.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n',
// and sc.pos is advanced accordingly (new line resets Col to 1).
func (sc *scanner) readRune() rune {
	// eof() has been inlined here, both to avoid a call
	// and to establish len(rest)>0 to avoid a bounds check.
	if len(sc.rest) == 0 {
		if !sc.readLine() {
			sc.error(sc.pos, "internal scanner error: readRune at EOF")
		}
		// Redundant, but eliminates the bounds-check below.
		if len(sc.rest) == 0 {
			return 0
		}
	}

	// fast path: ASCII
	if b := sc.rest[0]; b < utf8.RuneSelf {
		r := rune(b)
		sc.rest = sc.rest[1:]
		if r == '\r' {
			// Normalize CR and CRLF to a single '\n'.
			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
				sc.rest = sc.rest[1:]
			}
			r = '\n'
		}
		if r == '\n' {
			sc.pos.Line++
			sc.pos.Col = 1
		} else {
			sc.pos.Col++
		}
		return r
	}

	// Multibyte UTF-8 sequence; counts as a single column.
	r, size := utf8.DecodeRune(sc.rest)
	sc.rest = sc.rest[size:]
	sc.pos.Col++
	return r
}
// tokenValue records the position and value associated with each token.
// Only the fields relevant to the token's kind are meaningful.
type tokenValue struct {
	raw    string   // raw text of token
	int    int64    // decoded int
	bigInt *big.Int // decoded integers > int64
	float  float64  // decoded float
	string string   // decoded string or bytes
	pos    Position // start position of token
}
// startToken marks the beginning of the next input token.
// It must be followed by a call to endToken once the token has
// been consumed using readRune.
func (sc *scanner) startToken(val *tokenValue) {
	sc.token = sc.rest
	val.raw = ""
	val.pos = sc.pos
}

// endToken marks the end of an input token.
// It records the actual token string in val.raw if the caller
// has not done that already.
func (sc *scanner) endToken(val *tokenValue) {
	if val.raw == "" {
		// The token text is the prefix of sc.token consumed
		// (via readRune) since the call to startToken.
		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
	}
}
// nextToken is called by the parser to obtain the next input token.
// It returns the token value and sets val to the data associated with
// the token.
//
// For all our input tokens, the associated data is val.pos (the
// position where the token begins), val.raw (the input string
// corresponding to the token). For string and int tokens, the string
// and int fields additionally contain the token's interpreted value.
func (sc *scanner) nextToken(val *tokenValue) Token {

	// The following distribution of tokens guides case ordering:
	//
	// COMMA          27   %
	// STRING         23   %
	// IDENT          15   %
	// EQL            11   %
	// LBRACK          5.5 %
	// RBRACK          5.5 %
	// NEWLINE         3   %
	// LPAREN          2.9 %
	// RPAREN          2.9 %
	// INT             2   %
	// others        < 1   %
	//
	// Although NEWLINE tokens are infrequent, and lineStart is
	// usually (~97%) false on entry, skipped newlines account for
	// about 50% of all iterations of the 'start' loop.

start:
	var c rune

	// Deal with leading spaces and indentation.
	blank := false
	// savedLineStart is consulted at EOF to decide whether a
	// NEWLINE has effectively just been emitted.
	savedLineStart := sc.lineStart
	if sc.lineStart {
		sc.lineStart = false
		col := 0
		for {
			c = sc.peekRune()
			if c == ' ' {
				col++
				sc.readRune()
			} else if c == '\t' {
				// A tab advances to the next multiple of 8 columns.
				const tab = 8
				col += int(tab - (sc.pos.Col-1)%tab)
				sc.readRune()
			} else {
				break
			}
		}

		// The third clause matches EOF.
		if c == '#' || c == '\n' || c == 0 {
			blank = true
		}

		// Compute indentation level for non-blank lines not
		// inside an expression. This is not the common case.
		if !blank && sc.depth == 0 {
			cur := sc.indentstk[len(sc.indentstk)-1]
			if col > cur {
				// indent
				sc.dents++
				sc.indentstk = append(sc.indentstk, col)
			} else if col < cur {
				// outdent(s)
				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
					sc.dents--
					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
				}
				if col != sc.indentstk[len(sc.indentstk)-1] {
					sc.error(sc.pos, "unindent does not match any outer indentation level")
				}
			}
		}
	}

	// Return saved indentation tokens.
	if sc.dents != 0 {
		sc.startToken(val)
		sc.endToken(val)
		if sc.dents < 0 {
			sc.dents++
			return OUTDENT
		} else {
			sc.dents--
			return INDENT
		}
	}

	// start of line proper
	c = sc.peekRune()

	// Skip spaces.
	for c == ' ' || c == '\t' {
		sc.readRune()
		c = sc.peekRune()
	}

	// comment
	if c == '#' {
		if sc.keepComments {
			sc.startToken(val)
		}
		// Consume up to newline (included).
		for c != 0 && c != '\n' {
			sc.readRune()
			c = sc.peekRune()
		}
		if sc.keepComments {
			sc.endToken(val)
			// A comment on an otherwise-blank line is a "line comment";
			// one following code on the same line is a "suffix comment".
			if blank {
				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
			} else {
				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
			}
		}
	}

	// newline
	if c == '\n' {
		sc.lineStart = true

		// Ignore newlines within expressions (common case).
		if sc.depth > 0 {
			sc.readRune()
			goto start
		}

		// Ignore blank lines, except in the REPL,
		// where they emit OUTDENTs and NEWLINE.
		if blank {
			if sc.readline == nil {
				sc.readRune()
				goto start
			} else if len(sc.indentstk) > 1 {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			}
		}

		// At top-level (not in an expression).
		sc.startToken(val)
		sc.readRune()
		val.raw = "\n"
		return NEWLINE
	}

	// end of file
	if c == 0 {
		// Emit OUTDENTs for unfinished indentation,
		// preceded by a NEWLINE if we haven't just emitted one.
		if len(sc.indentstk) > 1 {
			if savedLineStart {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			} else {
				sc.lineStart = true
				sc.startToken(val)
				val.raw = "\n"
				return NEWLINE
			}
		}

		sc.startToken(val)
		sc.endToken(val)
		return EOF
	}

	// line continuation
	if c == '\\' {
		sc.readRune()
		if sc.peekRune() != '\n' {
			sc.errorf(sc.pos, "stray backslash in program")
		}
		sc.readRune()
		goto start
	}

	// start of the next token
	sc.startToken(val)

	// comma (common case)
	if c == ',' {
		sc.readRune()
		sc.endToken(val)
		return COMMA
	}

	// string literal
	if c == '"' || c == '\'' {
		return sc.scanString(val, c)
	}

	// identifier or keyword
	if isIdentStart(c) {
		// Check for a raw/bytes string prefix before the quote:
		// the prefix rune(s) are consumed here, and scanString
		// is handed the quote character.
		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
			//  r"..."
			//  b"..."
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
			// rb"..."
			sc.readRune()
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		}

		for isIdent(c) {
			sc.readRune()
			c = sc.peekRune()
		}
		sc.endToken(val)
		if k, ok := keywordToken[val.raw]; ok {
			return k
		}
		return IDENT
	}

	// brackets
	switch c {
	case '[', '(', '{':
		sc.depth++
		sc.readRune()
		sc.endToken(val)

		switch c {
		case '[':
			return LBRACK
		case '(':
			return LPAREN
		case '{':
			return LBRACE
		}
		panic("unreachable")

	case ']', ')', '}':
		if sc.depth == 0 {
			sc.errorf(sc.pos, "unexpected %q", c)
		} else {
			sc.depth--
		}
		sc.readRune()
		sc.endToken(val)

		switch c {
		case ']':
			return RBRACK
		case ')':
			return RPAREN
		case '}':
			return RBRACE
		}
		panic("unreachable")
	}

	// int or float literal, or period
	if isdigit(c) || c == '.' {
		return sc.scanNumber(val, c)
	}

	// other punctuation
	// (endToken is deferred so every return below records val.raw.)
	defer sc.endToken(val)
	switch c {
	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
		start := sc.pos
		sc.readRune()
		if sc.peekRune() == '=' {
			sc.readRune()
			switch c {
			case '<':
				return LE
			case '>':
				return GE
			case '=':
				return EQL
			case '!':
				return NEQ
			case '+':
				return PLUS_EQ
			case '-':
				return MINUS_EQ
			case '/':
				return SLASH_EQ
			case '%':
				return PERCENT_EQ
			case '&':
				return AMP_EQ
			case '|':
				return PIPE_EQ
			case '^':
				return CIRCUMFLEX_EQ
			}
		}
		switch c {
		case '=':
			return EQ
		case '<':
			if sc.peekRune() == '<' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return LTLT_EQ
				} else {
					return LTLT
				}
			}
			return LT
		case '>':
			if sc.peekRune() == '>' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return GTGT_EQ
				} else {
					return GTGT
				}
			}
			return GT
		case '!':
			// '!' is valid only as part of '!=', handled above.
			sc.error(start, "unexpected input character '!'")
		case '+':
			return PLUS
		case '-':
			return MINUS
		case '/':
			if sc.peekRune() == '/' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return SLASHSLASH_EQ
				} else {
					return SLASHSLASH
				}
			}
			return SLASH
		case '%':
			return PERCENT
		case '&':
			return AMP
		case '|':
			return PIPE
		case '^':
			return CIRCUMFLEX
		}
		panic("unreachable")

	case ':', ';', '~': // single-char tokens (except comma)
		sc.readRune()
		switch c {
		case ':':
			return COLON
		case ';':
			return SEMI
		case '~':
			return TILDE
		}
		panic("unreachable")

	case '*': // possibly followed by '*' or '='
		sc.readRune()
		switch sc.peekRune() {
		case '*':
			sc.readRune()
			return STARSTAR
		case '=':
			sc.readRune()
			return STAR_EQ
		}
		return STAR
	}

	sc.errorf(sc.pos, "unexpected input character %#q", c)
	panic("unreachable")
}
// scanString scans a string or bytes literal delimited by quote
// (either ' or "), whose opening quote is the next unconsumed rune.
// Any r/b/rb prefix has already been consumed by the caller
// (but is still part of sc.token, so it appears in val.raw).
// It returns STRING or BYTES, recording the decoded value in val.string.
func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
	start := sc.pos
	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
	sc.readRune()

	// String literals may contain escaped or unescaped newlines,
	// causing them to span multiple lines (gulps) of REPL input;
	// they are the only such token. Thus we cannot call endToken,
	// as it assumes sc.rest is unchanged since startToken.
	// Instead, buffer the token here.
	// TODO(adonovan): opt: buffer only if we encounter a newline.
	raw := new(strings.Builder)

	// Copy the prefix, e.g. r' or " (see startToken).
	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])

	if !triple {
		// single-quoted string literal
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			raw.WriteRune(c)
			if c == quote {
				break
			}
			if c == '\n' {
				sc.error(val.pos, "unexpected newline in string")
			}
			if c == '\\' {
				// Copy the escaped rune verbatim; unquote decodes
				// escape sequences later.
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				c = sc.readRune()
				raw.WriteRune(c)
			}
		}
	} else {
		// triple-quoted string literal
		sc.readRune()
		raw.WriteRune(quote)
		sc.readRune()
		raw.WriteRune(quote)

		// The literal ends at three consecutive unescaped quotes.
		quoteCount := 0
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			raw.WriteRune(c)
			if c == quote {
				quoteCount++
				if quoteCount == 3 {
					break
				}
			} else {
				quoteCount = 0
			}
			if c == '\\' {
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				c = sc.readRune()
				raw.WriteRune(c)
			}
		}
	}
	val.raw = raw.String()

	// Decode the buffered literal (prefix, quotes, and escapes).
	s, _, isByte, err := unquote(val.raw)
	if err != nil {
		sc.error(start, err.Error())
	}
	val.string = s
	if isByte {
		return BYTES
	} else {
		return STRING
	}
}
// scanNumber scans an int or float literal, or a lone period,
// whose first rune c has been peeked but not yet consumed.
// It returns INT (value in val.int, or val.bigInt if it exceeds
// int64), FLOAT (value in val.float), or DOT.
func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
	//
	// Python features not supported:
	// - integer literals of >64 bits of precision
	// - 123L or 123l long suffix
	// - traditional octal: 0755
	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals

	start := sc.pos
	fraction, exponent := false, false

	if c == '.' {
		// dot or start of fraction
		sc.readRune()
		c = sc.peekRune()
		if !isdigit(c) {
			sc.endToken(val)
			return DOT
		}
		fraction = true
	} else if c == '0' {
		// hex, octal, binary or float
		sc.readRune()
		c = sc.peekRune()

		if c == '.' {
			fraction = true
		} else if c == 'x' || c == 'X' {
			// hex
			sc.readRune()
			c = sc.peekRune()
			if !isxdigit(c) {
				sc.error(start, "invalid hex literal")
			}
			for isxdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'o' || c == 'O' {
			// octal
			sc.readRune()
			c = sc.peekRune()
			if !isodigit(c) {
				sc.error(sc.pos, "invalid octal literal")
			}
			for isodigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'b' || c == 'B' {
			// binary
			sc.readRune()
			c = sc.peekRune()
			if !isbdigit(c) {
				sc.error(sc.pos, "invalid binary literal")
			}
			for isbdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else {
			// float (or obsolete octal "0755")
			allzeros, octal := true, true
			for isdigit(c) {
				if c != '0' {
					allzeros = false
				}
				if c > '7' {
					octal = false
				}
				sc.readRune()
				c = sc.peekRune()
			}
			if c == '.' {
				fraction = true
			} else if c == 'e' || c == 'E' {
				exponent = true
			} else if octal && !allzeros {
				// Reject 0-prefixed decimal runs that look like
				// traditional octal, e.g. 0755.
				sc.endToken(val)
				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
			}
		}
	} else {
		// decimal
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == '.' {
			fraction = true
		} else if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if fraction {
		// Consume the pending '.' (or, when the literal began with
		// '.', the already-verified first fraction digit).
		sc.readRune()
		c = sc.peekRune()
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if exponent {
		sc.readRune() // consume [eE]
		c = sc.peekRune()
		if c == '+' || c == '-' {
			sc.readRune()
			c = sc.peekRune()
			if !isdigit(c) {
				sc.error(sc.pos, "invalid float literal")
			}
		}
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}
	}

	sc.endToken(val)
	if fraction || exponent {
		var err error
		val.float, err = strconv.ParseFloat(val.raw, 64)
		if err != nil {
			sc.error(sc.pos, "invalid float literal")
		}
		return FLOAT
	} else {
		var err error
		s := val.raw
		val.bigInt = nil
		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
			val.int, err = strconv.ParseInt(s[2:], 8, 64)
		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
			val.int, err = strconv.ParseInt(s[2:], 2, 64)
		} else {
			val.int, err = strconv.ParseInt(s, 0, 64)
			if err != nil {
				// Integer overflowed int64: fall back to big.Int.
				num := new(big.Int)
				var ok bool
				val.bigInt, ok = num.SetString(s, 0)
				if ok {
					err = nil
				}
			}
		}
		if err != nil {
			sc.error(start, "invalid int literal")
		}
		return INT
	}
}
// isIdent reports whether c may appear within an identifier
// (a letter, digit, or underscore).
func isIdent(c rune) bool { return isIdentStart(c) || isdigit(c) }

// isIdentStart reports whether c may begin an identifier:
// an ASCII letter, an underscore, or any Unicode letter.
func isIdentStart(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		c == '_':
		return true
	}
	return unicode.IsLetter(c)
}

// Character-class predicates for literal scanning.
func isdigit(c rune) bool  { return c >= '0' && c <= '9' }
func isodigit(c rune) bool { return c >= '0' && c <= '7' }
func isxdigit(c rune) bool { return isdigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') }
func isbdigit(c rune) bool { return c == '0' || c == '1' }
// keywordToken records the special tokens for
// strings that should not be treated as ordinary identifiers.
// Entries mapped to ILLEGAL are reserved words: the scanner returns
// ILLEGAL for them instead of IDENT (see nextToken).
var keywordToken = map[string]Token{
	"and":      AND,
	"break":    BREAK,
	"continue": CONTINUE,
	"def":      DEF,
	"elif":     ELIF,
	"else":     ELSE,
	"for":      FOR,
	"if":       IF,
	"in":       IN,
	"lambda":   LAMBDA,
	"load":     LOAD,
	"not":      NOT,
	"or":       OR,
	"pass":     PASS,
	"return":   RETURN,
	"while":    WHILE,

	// reserved words:
	"as": ILLEGAL,
	// "assert":  ILLEGAL, // heavily used by our tests
	"class":    ILLEGAL,
	"del":      ILLEGAL,
	"except":   ILLEGAL,
	"finally":  ILLEGAL,
	"from":     ILLEGAL,
	"global":   ILLEGAL,
	"import":   ILLEGAL,
	"is":       ILLEGAL,
	"nonlocal": ILLEGAL,
	"raise":    ILLEGAL,
	"try":      ILLEGAL,
	"with":     ILLEGAL,
	"yield":    ILLEGAL,
}