| #!/usr/bin/ruby |
| # encoding: utf-8 |
| |
| =begin LICENSE |
| |
| [The "BSD licence"] |
| Copyright (c) 2009-2010 Kyle Yetter |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions |
| are met: |
| |
| 1. Redistributions of source code must retain the above copyright |
| notice, this list of conditions and the following disclaimer. |
| 2. Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| 3. The name of the author may not be used to endorse or promote products |
| derived from this software without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| =end |
| |
| module ANTLR3 |
| |
| =begin rdoc ANTLR3::Token |
| |
| At a minimum, tokens are data structures that bind together a chunk of text and |
| a corresponding type symbol, which categorizes/characterizes the content of the |
| text. Tokens also usually carry information about their location in the input, |
| such as absolute character index, line number, and position within the line (or |
| column). |
| |
| Furthermore, ANTLR tokens are assigned a "channel" number, an extra degree of |
| categorization that groups things on a larger scale. Parsers will usually ignore |
| tokens that have channel value 99 (the HIDDEN_CHANNEL), so you can keep things |
| like comments and white space huddled together with neighboring tokens, |
| effectively ignoring them without discarding them. |
| |
| ANTLR tokens also keep a reference to the source stream from which they |
| originated. Token streams will also provide an index value for the token, which |
| indicates the position of the token relative to other tokens in the stream, |
| starting at zero. For example, the 22nd token pulled from a lexer by |
| CommonTokenStream will have index value 21. |
| |
| == Token as an Interface |
| |
| This library provides a token implementation (see CommonToken). Additionally, |
| you may write your own token class as long as you provide methods that give |
| access to the attributes expected by a token. Even though most of the ANTLR |
| library tries to use duck-typing techniques instead of pure object-oriented type |
| checking, it's a good idea to include this ANTLR3::Token into your customized |
| token class. |
| |
| =end |
| |
module Token
  include ANTLR3::Constants
  include Comparable

  # the token's associated chunk of text
  attr_accessor :text

  # the integer value associated with the token's type
  attr_accessor :type

  # the text's starting line number within the source (indexed starting at 1)
  attr_accessor :line

  # the text's starting position in the line within the source (indexed starting at 0)
  attr_accessor :column

  # the integer value of the channel to which the token is assigned
  attr_accessor :channel

  # the index of the token with respect to the other tokens produced during lexing
  attr_accessor :index

  # a reference to the input stream from which the token was extracted
  attr_accessor :input

  # the absolute character index in the input at which the text starts
  attr_accessor :start

  # the absolute character index in the input at which the text ends
  attr_accessor :stop

  alias :input_stream :input
  alias :input_stream= :input=
  alias :token_index :index
  alias :token_index= :index=

  #
  # The match operator has been implemented to match against several different
  # attributes of a token for convenience in quick scripts
  #
  # @example Match against an integer token type constant
  #   token =~ VARIABLE_NAME   => true/false
  # @example Match against a token type name as a Symbol
  #   token =~ :FLOAT          => true/false
  # @example Match the token text against a Regular Expression
  #   token =~ /^@[a-z_]\w*$/i
  # @example Compare the token's text to a string
  #   token =~ "class"
  #
  # Any other kind of operand is handed to the inherited =~ when one exists;
  # otherwise the result is nil (Object#=~ is gone from recent rubies, so a
  # bare +super+ would raise NoMethodError there).
  #
  def =~ obj
    case obj
    when Integer then type == obj
    when Symbol  then name == obj.to_s
    when Regexp  then obj =~ text
    when String  then text == obj
    else defined?( super ) ? super : nil
    end
  end

  #
  # Tokens are comparable by their stream index values.
  #
  # Returns nil (``not comparable'') when +tk2+ does not provide a usable
  # #index, so that the Comparable operators -- in particular
  # <tt>token == nil</tt> or <tt>token == "text"</tt> -- answer false instead
  # of leaking a NoMethodError/ArgumentError out of the comparison.
  #
  def <=> tk2
    index <=> tk2.index
  rescue NoMethodError, ArgumentError
    nil
  end

  #
  # Copy hook used by #dup and #clone: duplicates every attribute of +orig+
  # (cloning the text when present), but resets the stream index to -1.
  #
  def initialize_copy( orig )
    self.index   = -1
    self.type    = orig.type
    self.channel = orig.channel
    self.text    = orig.text.clone if orig.text
    self.start   = orig.start
    self.stop    = orig.stop
    self.line    = orig.line
    self.column  = orig.column
    self.input   = orig.input
  end

  # true when the token has an input stream as well as start and stop positions
  def concrete?
    input && start && stop ? true : false
  end

  # true when the token lacks an input stream, a start, or a stop position
  def imaginary?
    input && start && stop ? false : true
  end

  # the name of the token's type, as reported by the private #token_name
  def name
    token_name( type )
  end

  # the source name of the input stream, or nil/false if there is no input
  def source_name
    i = input and i.source_name
  end

  # true when the token is assigned to HIDDEN_CHANNEL
  def hidden?
    channel == HIDDEN_CHANNEL
  end

  #
  # For concrete tokens, the input's substring between start and stop;
  # otherwise the token's own text attribute.
  #
  def source_text
    concrete? ? input.substring( start, stop ) : text
  end

  #
  # Sets the token's channel value to HIDDEN_CHANNEL
  #
  def hide!
    self.channel = HIDDEN_CHANNEL
  end

  #
  # Builds a compact description of the form
  # <tt>INDEX NAME["text"] @ line L col C (START..STOP) (CHANNEL)</tt>.
  # Each section is left out when the attribute behind it is missing: a nil or
  # negative index, no text, a nil or non-positive line, no start position,
  # or a channel equal to DEFAULT_CHANNEL.
  #
  def inspect
    text_inspect    = text ? "[#{ text.inspect }] " : ' '
    # nil-safe: tokens built by custom classes may never have set line/index
    text_position   = ( line && line > 0 ) ? "@ line #{ line } col #{ column } " : ''
    stream_position = start ? "(#{ range.inspect })" : ''
    front           = ( index && index >= 0 ) ? "#{ index } " : ''

    # interpolation tolerates a nil type name, unlike String#<<
    rep = "#{ front }#{ name }#{ text_inspect }#{ text_position }#{ stream_position }".strip
    channel == DEFAULT_CHANNEL or rep << " (#{ channel })"
    return( rep )
  end

  # pretty-print support: emits the #inspect string through +printer+
  def pretty_print( printer )
    printer.text( inspect )
  end

  # the range start..stop, or nil if a range cannot be built from the two values
  def range
    start..stop rescue nil
  end

  # the token's index converted with #to_i
  def to_i
    index.to_i
  end

  # the token's text converted with #to_s
  def to_s
    text.to_s
  end

private

  # looks up the name for +type+ in BUILT_IN_TOKEN_NAMES
  def token_name( type )
    BUILT_IN_TOKEN_NAMES[ type ]
  end
end
| |
# Struct-backed storage for the standard token implementation. The order of
# the member names below is also the positional argument order of the
# generated constructor.
CommonToken = Struct.new(
  *%w( type channel text input start stop index line column ).map { |member| member.to_sym }
)
| |
| =begin rdoc ANTLR3::CommonToken |
| |
| The base class for the standard implementation of Token. It is implemented as a |
| simple Struct as tokens are basically simple data structures binding together a |
| bunch of different information and Structs are slightly faster than a standard |
| Object with accessor methods implementation. |
| |
| By default, ANTLR generated ruby code will provide a customized subclass of |
| CommonToken to track token-type names efficiently for debugging, inspection, and |
| general utility. Thus code generated for a standard combo lexer-parser grammar |
| named XYZ will have a base module named XYZ and a customized CommonToken |
| subclass named XYZ::Token. |
| |
| Here is the token structure attribute list in order: |
| |
| * <tt>type</tt> |
| * <tt>channel</tt> |
| * <tt>text</tt> |
| * <tt>input</tt> |
| * <tt>start</tt> |
| * <tt>stop</tt> |
| * <tt>index</tt> |
| * <tt>line</tt> |
| * <tt>column</tt> |
| |
| =end |
| |
class CommonToken
  include Token

  # attribute values used by CommonToken.create for fields that are not supplied
  DEFAULT_VALUES = {
    :channel => DEFAULT_CHANNEL,
    :index   => -1,
    :line    => 0,
    :column  => -1
  }.freeze

  # looks up the name for token type +type+ in BUILT_IN_TOKEN_NAMES
  def self.token_name( type )
    BUILT_IN_TOKEN_NAMES[ type ]
  end

  #
  # Builds a token from a hash of attribute values keyed by symbol; any
  # attribute missing from +fields+ takes its value from DEFAULT_VALUES
  # (or nil when it has no default).
  #
  def self.create( fields = {} )
    fields = DEFAULT_VALUES.merge( fields )
    args = members.map { |name| fields[ name.to_sym ] }
    new( *args )
  end

  # allows you to make a copy of a token with a different class
  # (the text is cloned and the index of the copy is reset to -1)
  def self.from_token( token )
    new(
      token.type,  token.channel, token.text ? token.text.clone : nil,
      token.input, token.start,   token.stop, -1, token.line, token.column
    )
  end

  #
  # Fills in the struct members, yields the new token to an optional block,
  # and finally, if the token still has no text, extracts the text from the
  # input using the start and stop positions.
  #
  def initialize( type = nil, channel = DEFAULT_CHANNEL, text = nil,
                  input = nil, start = nil, stop = nil, index = -1,
                  line = 0, column = -1 )
    super
    block_given? and yield( self )
    # text can only be recovered when there is an input to read it from;
    # without the input check a positioned, input-less token raised
    # NoMethodError on nil here
    if self.text.nil? && self.input && self.start && self.stop
      self.text = self.input.substring( self.start, self.stop )
    end
  end

  # re-declared here so the aliases bind to the Struct member accessors
  alias :input_stream :input
  alias :input_stream= :input=
  alias :token_index :index
  alias :token_index= :index=
end
| |
module Constants
  # Frozen sentinel tokens shared through the Constants module.

  # End of File / End of Input character and token type
  EOF_TOKEN     = CommonToken.new(EOF).freeze

  # two separate token objects, both built with INVALID_TOKEN_TYPE
  INVALID_TOKEN = CommonToken.new(INVALID_TOKEN_TYPE).freeze
  SKIP_TOKEN    = CommonToken.new(INVALID_TOKEN_TYPE).freeze
end
| |
| |
| |
| =begin rdoc ANTLR3::TokenSource |
| |
| TokenSource is a simple mixin module that demands an |
| implementation of the method #next_token. In return, it |
| defines methods #next and #each, which provide basic |
| iterator methods for token generators. Furthermore, it |
| includes Enumerable to provide the standard Ruby iteration |
| methods to token generators, like lexers. |
| |
| =end |
| |
module TokenSource
  include Constants
  include Enumerable
  extend ClassMacros

  abstract :next_token

  #
  # Pulls one token from #next_token and returns it; raises StopIteration
  # once the source hands back nil or a token whose type is EOF.
  #
  def next
    tk = next_token()
    tk.nil? || tk.type == EOF and raise StopIteration
    tk
  end

  #
  # Yields tokens from #next_token until it returns a false value or an EOF
  # token, then returns self. Without a block, returns an enumerator.
  #
  def each
    return enum_for( :each ) unless block_given?

    tk = next_token
    while tk and tk.type != EOF
      yield( tk )
      tk = next_token
    end
    self
  end

  #
  # Wraps this source in a CommonTokenStream built with +options+. A block
  # given to this method receives each (token, stream) pair the stream yields.
  #
  def to_stream( options = {} )
    return CommonTokenStream.new( self, options ) unless block_given?

    CommonTokenStream.new( self, options ) { | t, stream | yield( t, stream ) }
  end
end
| |
| |
| =begin rdoc ANTLR3::TokenFactory |
| |
| There are a variety of different entities throughout the ANTLR runtime library |
| that need to create token objects. This module serves as a mixin that provides |
| methods for constructing tokens. |
| |
| Including this module provides a +token_class+ attribute. Instances of the |
| including class can create tokens using the token class (which defaults to |
| ANTLR3::CommonToken). Token classes are presumed to have an #initialize method |
| that can be called without any parameters and the token objects are expected to |
| have the standard token attributes (see ANTLR3::Token). |
| |
| =end |
| |
module TokenFactory
  attr_writer :token_class

  #
  # The class used by #create_token. Unless one has been assigned, it is
  # resolved once and cached, trying in order: the +token_class+ of the
  # object's class, the constant +Token+ scoped under the object itself,
  # and finally ANTLR3::CommonToken.
  #
  def token_class
    return @token_class if @token_class

    @token_class =
      begin
        self.class.token_class
      rescue StandardError
        begin
          self::Token
        rescue StandardError
          ANTLR3::CommonToken
        end
      end
  end

  #
  # Instantiates the token class with +args+. When a block is given, it is
  # passed along to the constructor and receives whatever the constructor
  # yields.
  #
  def create_token( *args )
    return token_class.new( *args ) unless block_given?

    token_class.new( *args ) { |*yielded| yield( *yielded ) }
  end
end
| |
| |
| =begin rdoc ANTLR3::TokenScheme |
| |
| TokenSchemes exist to handle the problem of defining token types as integer |
| values while maintaining meaningful text names for the types. They are |
| dynamically defined modules that map integer values to constants with token-type |
| names. |
| |
| --- |
| |
| Fundamentally, tokens exist to take a chunk of text and identify it as belonging |
| to some category, like "VARIABLE" or "INTEGER". In code, the category is |
| represented by an integer -- some arbitrary value that ANTLR will decide to use |
| as it is creating the recognizer. The purpose of using an integer (instead of |
| say, a ruby symbol) is that ANTLR's decision logic often needs to test whether a |
| token's type falls within a range, which is not possible with symbols. |
| |
| The downside of token types being represented as integers is that a developer |
| needs to be able to reference the unknown type value by name in action code. |
| Furthermore, code that references the type by name and tokens that can be |
| inspected with names in place of type values are more meaningful to a developer. |
| |
| Since ANTLR requires token type names to follow capital-letter naming |
| conventions, defining types as named constants of the recognizer class resolves |
| the problem of referencing type values by name. Thus, a token type like |
| ``VARIABLE'' can be represented by a number like 5 and referenced within code by |
| +VARIABLE+. However, when a recognizer creates tokens, the name of the token's |
| type cannot be seen without using the data defined in the recognizer. |
| |
| Of course, tokens could be defined with a name attribute that could be specified |
| when tokens are created. However, doing so would make tokens take up more space |
| than necessary, as well as making it difficult to change the type of a token |
| while maintaining a correct name value. |
| |
| TokenSchemes exist as a technique to manage token type referencing and name |
| extraction. They: |
| |
| 1. keep token type references clear and understandable in recognizer code |
| 2. permit access to a token's type-name independently of recognizer objects |
| 3. allow multiple classes to share the same token information |
| |
| == Building Token Schemes |
| |
| TokenScheme is a subclass of Module. Thus, it has the method |
| <tt>TokenScheme.new(tk_class = nil) { ... module-level code ...}</tt>, which |
| will evaluate the block in the context of the scheme (module), similarly to |
| Module#module_eval. Before evaluating the block, <tt>.new</tt> will setup the |
| module with the following actions: |
| |
| 1. define a customized token class (more on that below) |
| 2. add a new constant, TOKEN_NAMES, which is a hash that maps types to names |
| 3. dynamically populate the new scheme module with a couple instance methods |
| 4. include ANTLR3::Constants in the new scheme module |
| |
| As the TokenScheme class functions as a metaclass, figuring out some of the |
| scoping behavior can be mildly confusing if you're trying to get a handle on the |
| entity for your own purposes. Remember that all of the instance methods of |
| TokenScheme function as module-level methods of TokenScheme instances, ala |
| +attr_accessor+ and friends. |
| |
| <tt>TokenScheme#define_token(name_symbol, int_value)</tt> adds a constant |
| definition <tt>name_symbol</tt> with the value <tt>int_value</tt>. It is |
| essentially like <tt>Module#const_set</tt>, except it forbids constant |
| overwriting (which would mess up recognizer code fairly badly) and adds an |
| inverse type-to-name map to its own <tt>TOKEN_NAMES</tt> table. |
| <tt>TokenScheme#define_tokens</tt> is a convenience method for defining many |
| types with a hash pairing names to values. |
| |
| <tt>TokenScheme#register_name(value, name_string)</tt> specifies a custom |
| type-to-name definition. This is particularly useful for the anonymous tokens |
| that ANTLR generates for literal strings in the grammar specification. For |
| example, if you refer to the literal <tt>'='</tt> in some parser rule in your |
| grammar, ANTLR will add a lexer rule for the literal and give the token a name |
| like <tt>T__<i>x</i></tt>, where <tt><i>x</i></tt> is the type's integer value. |
| Since this is pretty meaningless to a developer, generated code should add a |
| special name definition for type value <tt><i>x</i></tt> with the string |
| <tt>"'='"</tt>. |
| |
| === Sample TokenScheme Construction |
| |
| TokenData = ANTLR3::TokenScheme.new do |
| define_tokens( |
| :INT => 4, |
| :ID => 6, |
| :T__5 => 5, |
| :WS => 7 |
| ) |
| |
| # note the self:: scoping below is due to the fact that |
| # ruby lexically-scopes constant names instead of |
| # looking up in the current scope |
| register_name(self::T__5, "'='") |
| end |
| |
| TokenData::ID # => 6 |
| TokenData::T__5 # => 5 |
| TokenData.token_name(4) # => 'INT' |
| TokenData.token_name(5) # => "'='" |
| |
| class ARecognizerOrSuch < ANTLR3::Parser |
| include TokenData |
| ID # => 6 |
| end |
| |
| == Custom Token Classes and Relationship with Tokens |
| |
| When a TokenScheme is created, it will define a subclass of ANTLR3::CommonToken |
| and assign it to the constant name +Token+. This token class will both include |
| and extend the scheme module. Since token schemes define the private instance |
| method <tt>token_name(type)</tt>, instances of the token class are now able to |
| provide their type names. The Token method <tt>name</tt> uses the |
| <tt>token_name</tt> method to provide the type name as if it were a simple |
| attribute without storing the name itself. |
| |
| When a TokenScheme is included in a recognizer class, the class will now have |
| the token types as named constants, a type-to-name map constant +TOKEN_NAMES+, |
| and a grammar-specific subclass of ANTLR3::CommonToken assigned to the constant |
| Token. Thus, when recognizers need to manufacture tokens, instead of using the |
| generic CommonToken class, they can create tokens using the customized Token |
| class provided by the token scheme. |
| |
| If you need to use a token class other than CommonToken, you can pass the class |
| as a parameter to TokenScheme.new, which will be used in place of the |
| dynamically-created CommonToken subclass. |
| |
| =end |
| |
class TokenScheme < ::Module
  include TokenFactory

  #
  # Creates a new scheme module. +tk_class+ is the token class to install as
  # the scheme's +Token+ constant; when omitted, a fresh subclass of
  # ANTLR3::CommonToken is created. The optional block is evaluated in the
  # context of the new scheme after it has been set up with TOKEN_NAMES, the
  # +token_scheme+ / +token_names+ / +token_name+ methods, and
  # ANTLR3::Constants.
  #
  def self.new( tk_class = nil, &body )
    super() do
      tk_class ||= Class.new( ::ANTLR3::CommonToken )
      self.token_class = tk_class

      const_set( :TOKEN_NAMES, ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.clone )

      @types  = ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.invert
      @unused = ::ANTLR3::Constants::MIN_TOKEN_TYPE

      scheme = self
      define_method( :token_scheme ) { scheme }
      define_method( :token_names )  { scheme::TOKEN_NAMES }
      define_method( :token_name ) do |type|
        begin
          # super needs explicit arguments here: implicit-argument super is
          # not supported inside define_method bodies and raises RuntimeError,
          # which would slip past the NoMethodError rescue below
          token_names[ type ] or super( type )
        rescue NoMethodError
          ::ANTLR3::CommonToken.token_name( type )
        end
      end
      module_function :token_name, :token_names

      include ANTLR3::Constants

      body and module_eval( &body )
    end
  end

  #
  # Convenience constructor. Arguments are flattened, stripped of nils and
  # duplicates, and then interpreted as: an optional leading token class,
  # hashes mapping names to values (passed to #define_tokens), and plain
  # names (passed to #define_token, which assigns the next unused value).
  #
  def self.build( *token_names )
    token_names = token_names.flatten.compact.uniq
    tk_class = Class === token_names.first ? token_names.shift : nil
    value_maps, names = token_names.partition { |i| Hash === i }
    new( tk_class ) do
      value_maps.each { |value_map| define_tokens( value_map ) }
      names.each { |name| define_token( name ) }
    end
  end

  # when the scheme is included somewhere, also extend the includer with it
  def included( mod )
    super
    mod.extend( self )
  end
  private :included

  # unused: the next token type value that will be handed out automatically
  # types: map of token names to type values
  attr_reader :unused, :types

  #
  # Defines one token type per name => value pair of +token_map+.
  # Returns the scheme.
  #
  def define_tokens( token_map = {} )
    token_map.each { |name, value| define_token( name, value ) }
    return self
  end

  #
  # Defines the token type +name+ with the integer +value+ (the next unused
  # value when omitted). Names that look like constants are set as constants
  # of the scheme; any other name is attached to the constant
  # <tt>T__<i>value</i></tt>. Redefining a name with its current value is a
  # no-op; redefining it with a different value raises NameError.
  # Returns the scheme.
  #
  def define_token( name, value = nil )
    name = name.to_s

    if current_value = @types[ name ]
      # token type has already been defined
      # raise an error unless value is the same as the current value
      value ||= current_value
      unless current_value == value
        raise NameError.new(
          "new token type definition ``#{ name } = #{ value }'' conflicts " <<
          "with existing type definition ``#{ name } = #{ current_value }''", name
        )
      end
    else
      value ||= @unused
      if name =~ /^[A-Z]\w*$/
        const_set( name, @types[ name ] = value )
      else
        constant = "T__#{ value }"
        const_set( constant, @types[ constant ] = value )
        @types[ name ] = value
      end
      register_name( value, name ) unless built_in_type?( value )
    end

    # keep the automatic counter ahead of every value seen so far
    value >= @unused and @unused = value + 1
    return self
  end

  #
  # Registers several type names at once. Given a single hash, each
  # value => name pair is registered; given a list of names, they are
  # registered against consecutive type values starting at MIN_TOKEN_TYPE.
  #
  def register_names( *names )
    if names.length == 1 and Hash === names.first
      names.first.each do |value, name|
        register_name( value, name )
      end
    else
      names.each_with_index do |name, i|
        type_value = Constants::MIN_TOKEN_TYPE + i
        register_name( type_value, name )
      end
    end
  end

  #
  # Records +name+ as the display name of +type_value+. An anonymous
  # <tt>T__<i>n</i></tt> name may be upgraded to a literal name, and an
  # attempt to downgrade a literal name back to the anonymous one is ignored;
  # any other change of an existing name raises NameError.
  # Returns the name that ends up registered.
  #
  def register_name( type_value, name )
    name = name.to_s.freeze
    if token_names.has_key?( type_value )
      current_name = token_names[ type_value ]
      current_name == name and return name

      if current_name == "T__#{ type_value }"
        # only an anonymous name is registered -- upgrade the name to the full literal name
        token_names[ type_value ] = name
      elsif name == "T__#{ type_value }"
        # ignore name downgrade from literal to anonymous constant
        return current_name
      else
        error = NameError.new(
          "attempted assignment of token type #{ type_value }" <<
          " to name #{ name } conflicts with existing name #{ current_name }", name
        )
        raise error
      end
    else
      token_names[ type_value ] = name
    end
  end

  # true if +type_value+ has a truthy entry in Constants::BUILT_IN_TOKEN_NAMES
  def built_in_type?( type_value )
    Constants::BUILT_IN_TOKEN_NAMES.fetch( type_value, false ) and true
  end

  #
  # Tests whether a token type is known to the scheme: integers are looked up
  # in the type-to-name table, anything else is treated as a constant name.
  #
  def token_defined?( name_or_value )
    # the subject must be the argument itself (the original switched on an
    # undefined local and therefore always raised NameError)
    case name_or_value
    when Integer then token_names.has_key?( name_or_value )
    else const_defined?( name_or_value.to_s )
    end
  end

  #
  # Two-way lookup: an integer yields the registered type name (or nil);
  # a name yields the value of the constant with that name or, failing that,
  # the type value registered under that display name (or nil).
  #
  def []( name_or_value )
    case name_or_value
    when Integer then token_names.fetch( name_or_value, nil )
    else
      begin
        const_get( name_or_value.to_s )
      rescue NameError
        # not a constant (e.g. a literal name like "'='"): reverse-search the
        # name table; Hash#key replaces the removed Hash#index
        token_names.key( name_or_value.to_s )
      end
    end
  end

  # the token class of the scheme, stored in its +Token+ constant
  def token_class
    self::Token
  end

  #
  # Installs +klass+ as the scheme's +Token+ constant, including the scheme
  # into the class unless it already descends from it.
  # Raises TypeError unless +klass+ is a Class.
  #
  def token_class=( klass )
    Class === klass or raise( TypeError, "token_class must be a Class" )
    Util.silence_warnings do
      klass < self or klass.send( :include, self )
      const_set( :Token, klass )
    end
  end

end
| |
| end |