| package TAP::Parser::Grammar; |
| |
| use strict; |
| use vars qw($VERSION @ISA); |
| |
| use TAP::Object (); |
| use TAP::Parser::ResultFactory (); |
| use TAP::Parser::YAMLish::Reader (); |
| |
| @ISA = qw(TAP::Object); |
| |
| =head1 NAME |
| |
| TAP::Parser::Grammar - A grammar for the Test Anything Protocol. |
| |
| =head1 VERSION |
| |
| Version 3.23 |
| |
| =cut |
| |
| $VERSION = '3.23'; |
| |
| =head1 SYNOPSIS |
| |
| use TAP::Parser::Grammar; |
| my $grammar = $self->make_grammar({ |
| iterator => $tap_parser_iterator, |
| parser => $tap_parser, |
| version => 12, |
| }); |
| |
| my $result = $grammar->tokenize; |
| |
| =head1 DESCRIPTION |
| |
| C<TAP::Parser::Grammar> tokenizes lines from a L<TAP::Parser::Iterator> and |
| constructs L<TAP::Parser::Result> subclasses to represent the tokens. |
| |
| Do not attempt to use this class directly. It won't make sense. It's mainly |
| here to ensure that we will be able to have pluggable grammars when TAP is |
| expanded at some future date (plus, this stuff was really cluttering the |
| parser). |
| |
| =head1 METHODS |
| |
| =head2 Class Methods |
| |
| =head3 C<new> |
| |
| my $grammar = TAP::Parser::Grammar->new({ |
| iterator => $iterator, |
| parser => $parser, |
| version => $version, |
| }); |
| |
| Returns L<TAP::Parser> grammar object that will parse the TAP stream from the |
| specified iterator. Both C<iterator> and C<parser> are required arguments. |
| If C<version> is not set it defaults to C<12> (see L</set_version> for more |
| details). |
| |
| =cut |
| |
| # new() implementation supplied by TAP::Object |
| sub _initialize { |
| my ( $self, $args ) = @_; |
| $self->{iterator} = $args->{iterator}; # TODO: accessor |
| $self->{iterator} ||= $args->{stream}; # deprecated |
| $self->{parser} = $args->{parser}; # TODO: accessor |
| $self->set_version( $args->{version} || 12 ); |
| return $self; |
| } |
| |
| my %language_for; |
| |
| { |
| |
| # XXX the 'not' and 'ok' might be on separate lines in VMS ... |
| my $ok = qr/(?:not )?ok\b/; |
| my $num = qr/\d+/; |
| |
| my %v12 = ( |
| version => { |
| syntax => qr/^TAP\s+version\s+(\d+)\s*\z/i, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my $version = $1; |
| return $self->_make_version_token( $line, $version, ); |
| }, |
| }, |
| plan => { |
| syntax => qr/^1\.\.(\d+)\s*(.*)\z/, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my ( $tests_planned, $tail ) = ( $1, $2 ); |
| my $explanation = undef; |
| my $skip = ''; |
| |
| if ( $tail =~ /^todo((?:\s+\d+)+)/ ) { |
| my @todo = split /\s+/, _trim($1); |
| return $self->_make_plan_token( |
| $line, $tests_planned, 'TODO', |
| '', \@todo |
| ); |
| } |
| elsif ( 0 == $tests_planned ) { |
| $skip = 'SKIP'; |
| |
| # If we can't match # SKIP the directive should be undef. |
| ($explanation) = $tail =~ /^#\s*SKIP\S*\s+(.*)/i; |
| } |
| elsif ( $tail !~ /^\s*$/ ) { |
| return $self->_make_unknown_token($line); |
| } |
| |
| $explanation = '' unless defined $explanation; |
| |
| return $self->_make_plan_token( |
| $line, $tests_planned, $skip, |
| $explanation, [] |
| ); |
| |
| }, |
| }, |
| |
| # An optimization to handle the most common test lines without |
| # directives. |
| simple_test => { |
| syntax => qr/^($ok) \ ($num) (?:\ ([^#]+))? \z/x, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my ( $ok, $num, $desc ) = ( $1, $2, $3 ); |
| |
| return $self->_make_test_token( |
| $line, $ok, $num, |
| $desc |
| ); |
| }, |
| }, |
| test => { |
| syntax => qr/^($ok) \s* ($num)? \s* (.*) \z/x, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my ( $ok, $num, $desc ) = ( $1, $2, $3 ); |
| my ( $dir, $explanation ) = ( '', '' ); |
| if ($desc =~ m/^ ( [^\\\#]* (?: \\. [^\\\#]* )* ) |
| \# \s* (SKIP|TODO) \b \s* (.*) $/ix |
| ) |
| { |
| ( $desc, $dir, $explanation ) = ( $1, $2, $3 ); |
| } |
| return $self->_make_test_token( |
| $line, $ok, $num, $desc, |
| $dir, $explanation |
| ); |
| }, |
| }, |
| comment => { |
| syntax => qr/^#(.*)/, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my $comment = $1; |
| return $self->_make_comment_token( $line, $comment ); |
| }, |
| }, |
| bailout => { |
| syntax => qr/^\s*Bail out!\s*(.*)/, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my $explanation = $1; |
| return $self->_make_bailout_token( |
| $line, |
| $explanation |
| ); |
| }, |
| }, |
| ); |
| |
| my %v13 = ( |
| %v12, |
| plan => { |
| syntax => qr/^1\.\.(\d+)(?:\s*#\s*SKIP\b(.*))?\z/i, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my ( $tests_planned, $explanation ) = ( $1, $2 ); |
| my $skip |
| = ( 0 == $tests_planned || defined $explanation ) |
| ? 'SKIP' |
| : ''; |
| $explanation = '' unless defined $explanation; |
| return $self->_make_plan_token( |
| $line, $tests_planned, $skip, |
| $explanation, [] |
| ); |
| }, |
| }, |
| yaml => { |
| syntax => qr/^ (\s+) (---.*) $/x, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my ( $pad, $marker ) = ( $1, $2 ); |
| return $self->_make_yaml_token( $pad, $marker ); |
| }, |
| }, |
| pragma => { |
| syntax => |
| qr/^ pragma \s+ ( [-+] \w+ \s* (?: , \s* [-+] \w+ \s* )* ) $/x, |
| handler => sub { |
| my ( $self, $line ) = @_; |
| my $pragmas = $1; |
| return $self->_make_pragma_token( $line, $pragmas ); |
| }, |
| }, |
| ); |
| |
| %language_for = ( |
| '12' => { |
| tokens => \%v12, |
| }, |
| '13' => { |
| tokens => \%v13, |
| setup => sub { |
| shift->{iterator}->handle_unicode; |
| }, |
| }, |
| ); |
| } |
| |
| ############################################################################## |
| |
| =head2 Instance Methods |
| |
| =head3 C<set_version> |
| |
| $grammar->set_version(13); |
| |
| Tell the grammar which TAP syntax version to support. The lowest |
| supported version is 12. Although 'TAP version' isn't valid version 12 |
| syntax it is accepted so that higher version numbers may be parsed. |
| |
| =cut |
| |
| sub set_version { |
| my $self = shift; |
| my $version = shift; |
| |
| if ( my $language = $language_for{$version} ) { |
| $self->{version} = $version; |
| $self->{tokens} = $language->{tokens}; |
| |
| if ( my $setup = $language->{setup} ) { |
| $self->$setup(); |
| } |
| |
| $self->_order_tokens; |
| } |
| else { |
| require Carp; |
| Carp::croak("Unsupported syntax version: $version"); |
| } |
| } |
| |
| # Optimization to put the most frequent tokens first. |
| sub _order_tokens { |
| my $self = shift; |
| |
| my %copy = %{ $self->{tokens} }; |
| my @ordered_tokens = grep {defined} |
| map { delete $copy{$_} } qw( simple_test test comment plan ); |
| push @ordered_tokens, values %copy; |
| |
| $self->{ordered_tokens} = \@ordered_tokens; |
| } |
| |
| ############################################################################## |
| |
| =head3 C<tokenize> |
| |
| my $token = $grammar->tokenize; |
| |
| This method will return a L<TAP::Parser::Result> object representing the |
| current line of TAP. |
| |
| =cut |
| |
| sub tokenize { |
| my $self = shift; |
| |
| my $line = $self->{iterator}->next; |
| unless ( defined $line ) { |
| delete $self->{parser}; # break circular ref |
| return; |
| } |
| |
| my $token; |
| |
| for my $token_data ( @{ $self->{ordered_tokens} } ) { |
| if ( $line =~ $token_data->{syntax} ) { |
| my $handler = $token_data->{handler}; |
| $token = $self->$handler($line); |
| last; |
| } |
| } |
| |
| $token = $self->_make_unknown_token($line) unless $token; |
| |
| return $self->{parser}->make_result($token); |
| } |
| |
| ############################################################################## |
| |
| =head3 C<token_types> |
| |
| my @types = $grammar->token_types; |
| |
| Returns the different types of tokens which this grammar can parse. |
| |
| =cut |
| |
| sub token_types { |
| my $self = shift; |
| return keys %{ $self->{tokens} }; |
| } |
| |
| ############################################################################## |
| |
| =head3 C<syntax_for> |
| |
| my $syntax = $grammar->syntax_for($token_type); |
| |
| Returns a pre-compiled regular expression which will match a chunk of TAP |
| corresponding to the token type. For example (not that you should really pay |
| attention to this, C<< $grammar->syntax_for('comment') >> will return |
| C<< qr/^#(.*)/ >>. |
| |
| =cut |
| |
| sub syntax_for { |
| my ( $self, $type ) = @_; |
| return $self->{tokens}->{$type}->{syntax}; |
| } |
| |
| ############################################################################## |
| |
| =head3 C<handler_for> |
| |
| my $handler = $grammar->handler_for($token_type); |
| |
| Returns a code reference which, when passed an appropriate line of TAP, |
| returns the lexed token corresponding to that line. As a result, the basic |
| TAP parsing loop looks similar to the following: |
| |
| my @tokens; |
| my $grammar = TAP::Grammar->new; |
| LINE: while ( defined( my $line = $parser->_next_chunk_of_tap ) ) { |
| for my $type ( $grammar->token_types ) { |
| my $syntax = $grammar->syntax_for($type); |
| if ( $line =~ $syntax ) { |
| my $handler = $grammar->handler_for($type); |
| push @tokens => $grammar->$handler($line); |
| next LINE; |
| } |
| } |
| push @tokens => $grammar->_make_unknown_token($line); |
| } |
| |
| =cut |
| |
| sub handler_for { |
| my ( $self, $type ) = @_; |
| return $self->{tokens}->{$type}->{handler}; |
| } |
| |
| sub _make_version_token { |
| my ( $self, $line, $version ) = @_; |
| return { |
| type => 'version', |
| raw => $line, |
| version => $version, |
| }; |
| } |
| |
| sub _make_plan_token { |
| my ( $self, $line, $tests_planned, $directive, $explanation, $todo ) = @_; |
| |
| if ( $directive eq 'SKIP' |
| && 0 != $tests_planned |
| && $self->{version} < 13 ) |
| { |
| warn |
| "Specified SKIP directive in plan but more than 0 tests ($line)\n"; |
| } |
| |
| return { |
| type => 'plan', |
| raw => $line, |
| tests_planned => $tests_planned, |
| directive => $directive, |
| explanation => _trim($explanation), |
| todo_list => $todo, |
| }; |
| } |
| |
| sub _make_test_token { |
| my ( $self, $line, $ok, $num, $desc, $dir, $explanation ) = @_; |
| return { |
| ok => $ok, |
| test_num => $num, |
| description => _trim($desc), |
| directive => ( defined $dir ? uc $dir : '' ), |
| explanation => _trim($explanation), |
| raw => $line, |
| type => 'test', |
| }; |
| } |
| |
| sub _make_unknown_token { |
| my ( $self, $line ) = @_; |
| return { |
| raw => $line, |
| type => 'unknown', |
| }; |
| } |
| |
| sub _make_comment_token { |
| my ( $self, $line, $comment ) = @_; |
| return { |
| type => 'comment', |
| raw => $line, |
| comment => _trim($comment) |
| }; |
| } |
| |
| sub _make_bailout_token { |
| my ( $self, $line, $explanation ) = @_; |
| return { |
| type => 'bailout', |
| raw => $line, |
| bailout => _trim($explanation) |
| }; |
| } |
| |
| sub _make_yaml_token { |
| my ( $self, $pad, $marker ) = @_; |
| |
| my $yaml = TAP::Parser::YAMLish::Reader->new; |
| |
| my $iterator = $self->{iterator}; |
| |
| # Construct a reader that reads from our input stripping leading |
| # spaces from each line. |
| my $leader = length($pad); |
| my $strip = qr{ ^ (\s{$leader}) (.*) $ }x; |
| my @extra = ($marker); |
| my $reader = sub { |
| return shift @extra if @extra; |
| my $line = $iterator->next; |
| return $2 if $line =~ $strip; |
| return; |
| }; |
| |
| my $data = $yaml->read($reader); |
| |
| # Reconstitute input. This is convoluted. Maybe we should just |
| # record it on the way in... |
| chomp( my $raw = $yaml->get_raw ); |
| $raw =~ s/^/$pad/mg; |
| |
| return { |
| type => 'yaml', |
| raw => $raw, |
| data => $data |
| }; |
| } |
| |
| sub _make_pragma_token { |
| my ( $self, $line, $pragmas ) = @_; |
| return { |
| type => 'pragma', |
| raw => $line, |
| pragmas => [ split /\s*,\s*/, _trim($pragmas) ], |
| }; |
| } |
| |
| sub _trim { |
| my $data = shift; |
| |
| return '' unless defined $data; |
| |
| $data =~ s/^\s+//; |
| $data =~ s/\s+$//; |
| return $data; |
| } |
| |
| 1; |
| |
| =head1 TAP GRAMMAR |
| |
| B<NOTE:> This grammar is slightly out of date. There's still some discussion |
| about it and a new one will be provided when we have things better defined. |
| |
| The L<TAP::Parser> does not use a formal grammar because TAP is essentially a |
| stream-based protocol. In fact, it's quite legal to have an infinite stream. |
| For the same reason that we don't apply regexes to streams, we're not using a |
| formal grammar here. Instead, we parse the TAP in lines. |
| |
| For purposes for forward compatibility, any result which does not match the |
| following grammar is currently referred to as |
| L<TAP::Parser::Result::Unknown>. It is I<not> a parse error. |
| |
| A formal grammar would look similar to the following: |
| |
| (* |
| For the time being, I'm cheating on the EBNF by allowing |
| certain terms to be defined by POSIX character classes by |
| using the following syntax: |
| |
| digit ::= [:digit:] |
| |
| As far as I am aware, that's not valid EBNF. Sue me. I |
| didn't know how to write "char" otherwise (Unicode issues). |
| Suggestions welcome. |
| *) |
| |
| tap ::= version? { comment | unknown } leading_plan lines |
| | |
| lines trailing_plan {comment} |
| |
| version ::= 'TAP version ' positiveInteger {positiveInteger} "\n" |
| |
| leading_plan ::= plan skip_directive? "\n" |
| |
| trailing_plan ::= plan "\n" |
| |
| plan ::= '1..' nonNegativeInteger |
| |
| lines ::= line {line} |
| |
| line ::= (comment | test | unknown | bailout ) "\n" |
| |
| test ::= status positiveInteger? description? directive? |
| |
| status ::= 'not '? 'ok ' |
| |
| description ::= (character - (digit | '#')) {character - '#'} |
| |
| directive ::= todo_directive | skip_directive |
| |
| todo_directive ::= hash_mark 'TODO' ' ' {character} |
| |
| skip_directive ::= hash_mark 'SKIP' ' ' {character} |
| |
| comment ::= hash_mark {character} |
| |
| hash_mark ::= '#' {' '} |
| |
| bailout ::= 'Bail out!' {character} |
| |
| unknown ::= { (character - "\n") } |
| |
| (* POSIX character classes and other terminals *) |
| |
| digit ::= [:digit:] |
| character ::= ([:print:] - "\n") |
| positiveInteger ::= ( digit - '0' ) {digit} |
| nonNegativeInteger ::= digit {digit} |
| |
| =head1 SUBCLASSING |
| |
| Please see L<TAP::Parser/SUBCLASSING> for a subclassing overview. |
| |
| If you I<really> want to subclass L<TAP::Parser>'s grammar the best thing to |
| do is read through the code. There's no easy way of summarizing it here. |
| |
| =head1 SEE ALSO |
| |
| L<TAP::Object>, |
| L<TAP::Parser>, |
| L<TAP::Parser::Iterator>, |
| L<TAP::Parser::Result>, |
| |
| =cut |