blob: 06354b1d85ed83ddb8f434e71829fce8b0a37d1d [file] [log] [blame]
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import imp
import os.path
import sys
def _GetDirAbove(dirname):
  """Walks upward from this file's location and returns the path of the
  directory that directly contains the ancestor directory |dirname|.

  |dirname| must actually occur somewhere above this file; otherwise the
  assert below fires once the path components are exhausted.
  """
  current = os.path.abspath(__file__)
  while True:
    current, component = os.path.split(current)
    # An empty component means we reached the filesystem root without
    # encountering |dirname|.
    assert component
    if component == dirname:
      return current
# Bootstrap: make sure the "ply" package is importable.  If it is not already
# on sys.path, assume the Chromium checkout layout and add the third_party
# directory (a sibling of the "mojo" directory above this file), which
# vendors ply.
# NOTE(review): the "imp" module is deprecated (removed in Python 3.12);
# importlib.util.find_spec is the modern equivalent — confirm the supported
# interpreter versions before switching.
try:
  imp.find_module("ply")
except ImportError:
  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
from ply.lex import TOKEN
from ..error import Error
class LexError(Error):
  """Raised when tokenizing mojom source fails; carries the source filename
  and the offending line number through the shared Error base class."""

  def __init__(self, filename, message, lineno):
    super(LexError, self).__init__(filename, message, lineno=lineno)
# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer(object):
  """ply.lex tokenizer for mojom IDL source.

  ply.lex builds the scanner reflectively from this class's attributes:
  `tokens` declares every token name, `t_ignore` and `t_error` are required
  PLY hooks, string attributes named t_<TOKEN> are simple regex rules, and
  methods named t_<TOKEN> are function rules.  NOTE(review): PLY tries
  function rules in their source-definition order (string rules are ordered
  by decreasing regex length), so the relative order of the t_* methods
  below is load-bearing — do not reorder them.
  """

  def __init__(self, filename):
    # Only used to label LexError diagnostics; the input text itself is fed
    # to ply separately, not read from this path.
    self.filename = filename

  ######################-- PRIVATE --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    # Central failure funnel: every lexer error is reported with the source
    # filename and the offending token's line number.
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',
    'IMPORT',
    'MODULE',
    'STRUCT',
    'UNION',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
    'ARRAY',
    'MAP',
    'ASSOCIATED'
  )

  # Maps the lower-case source spelling to the token name, e.g.
  # "struct" -> 'STRUCT'.  Used by t_NAME to reclassify identifiers that
  # are actually keywords.
  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',
    'QSTN',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',  # ( )
    'LBRACKET', 'RBRACKET',  # [ ]
    'LBRACE', 'RBRACE',  # { }
    'LANGLE', 'RANGLE',  # < >
    'SEMI',  # ;
    'COMMA', 'DOT'  # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  # A backslash followed by anything outside the recognized escape sets;
  # used only to flag bad string literals below.
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines
  def t_NEWLINE(self, t):
    r'\n+'
    # No return value: the token is discarded, but the line counter must
    # still advance so later diagnostics report the right line.
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS = r'-'
  t_PLUS = r'\+'
  t_AMP = r'&'
  t_QSTN = r'\?'

  # =
  t_EQUALS = r'='

  # =>
  t_RESPONSE = r'=>'

  # Delimiters
  t_LPAREN = r'\('
  t_RPAREN = r'\)'
  t_LBRACKET = r'\['
  t_RBRACKET = r'\]'
  t_LBRACE = r'\{'
  t_RBRACE = r'\}'
  t_LANGLE = r'<'
  t_RANGLE = r'>'
  t_COMMA = r','
  t_DOT = r'\.'
  t_SEMI = r';'

  t_STRING_LITERAL = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    # Must precede t_INT_CONST_DEC so a leading-zero literal is rejected
    # rather than lexing as "0" followed by a second integer.
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # unmatched string literals are caught by the preprocessor
  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    # A bare '@' with no digits after it.
    msg = "Missing ordinal value"
    self._error(msg, t)

  @TOKEN(identifier)
  def t_NAME(self, t):
    # Reclassify identifiers that are reserved keywords.
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    # Comments are dropped, but embedded newlines still advance the line
    # counter.
    t.lexer.lineno += t.value.count("\n")

  def t_error(self, t):
    # Required PLY hook: fires when no rule matches the next character.
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)