// blob: 31fa2037ba141cbf6ff653b132b350088c8dbdf1 [file] [log] [blame]
// Lexer-only grammar. The options block selects Python as the language of the
// generated lexer, so every embedded action below is Python source.
lexer grammar t012lexerXMLLexer;
options {
language = Python;
}
// Import needed by the embedded actions: StringIO is instantiated in the
// lexer init action.
// NOTE(review): cStringIO is a Python 2 module -- confirm the target runtime.
@header {
from cStringIO import StringIO
}
// Lexer initialisation action: creates the in-memory buffer that output()
// appends its trace lines to.
@lexer::init {
self.outbuf = StringIO()
}
// Helper method added to the generated lexer class; the rule actions call it
// to record what they matched.
@lexer::members {
def output(self, line):
    # Append one trace line, terminated by a newline, to self.outbuf.
    # Unicode text is encoded as UTF-8 before writing. Byte strings are
    # written unchanged, because calling encode on a byte string forces an
    # implicit ASCII decode that fails on non-ASCII bytes.
    if isinstance(line, unicode):
        line = line.encode('utf-8')
    self.outbuf.write(line + "\n")
}
// The only non-fragment rule in the visible grammar: a whole document is
// matched as one token. Every part except the root ELEMENT is optional.
DOCUMENT
    :   XMLDECL?
        WS?
        DOCTYPE?
        WS?
        ELEMENT
        WS?
    ;
// Matches a document type declaration and reports, through output(), the root
// element name, the SYSTEM / PUBLIC identifiers when given, and the internal
// DTD subset when present.
// Labels are read with the $label form, consistent with the ELEMENT rule.
// NOTE(review): whitespace after the root element name is mandatory here, so
// a declaration such as <!DOCTYPE root> does not match -- confirm intended.
fragment DOCTYPE
    :   '<!DOCTYPE' WS rootElementName=GENERIC_ID
        {self.output("ROOTELEMENT: "+$rootElementName.text)}
        WS
        (
            (   'SYSTEM' WS sys1=VALUE
                {self.output("SYSTEM: "+$sys1.text)}
            |   'PUBLIC' WS pub=VALUE WS sys2=VALUE
                {self.output("PUBLIC: "+$pub.text)}
                {self.output("SYSTEM: "+$sys2.text)}
            )
            ( WS )?
        )?
        (   dtd=INTERNAL_DTD
            {self.output("INTERNAL DTD: "+$dtd.text)}
        )?
        '>'
    ;
// Internal DTD subset: an opening bracket, then any characters matched
// non-greedily, up to the first closing bracket.
fragment INTERNAL_DTD
    :   '['
        ( options {greedy=false;} : . )*
        ']'
    ;
// Processing instruction: reports the target name through output(), then
// consumes any attribute-style pairs up to the closing marker. Each ATTRIBUTE
// reports itself.
// The label is read with the $label form, consistent with the ELEMENT rule.
fragment PI
    :   '<?' target=GENERIC_ID WS?
        {self.output("PI: "+$target.text)}
        ( ATTRIBUTE WS? )* '?>'
    ;
// XML declaration: the opening marker followed by the letters x, m, l in
// either case, then optional attribute pairs and the closing marker.
// The fixed trace line is emitted before the attributes are matched.
fragment XMLDECL
    :   '<?'
        ('x'|'X') ('m'|'M') ('l'|'L')
        WS?
        {self.output("XML declaration")}
        ( ATTRIBUTE WS? )*
        '?>'
    ;
// One element: either an empty-element tag, or a start tag followed by any mix
// of nested elements, character data, CDATA sections, comments and processing
// instructions, closed by an end tag.
// Text, CDATA and comment children are reported here; tags and processing
// instructions are reported by their own rules.
// The end tag name is not compared with the start tag name.
fragment ELEMENT
    :   (   START_TAG
            (   ELEMENT
            |   t=PCDATA
                {self.output("PCDATA: \""+$t.text+"\"")}
            |   t=CDATA
                {self.output("CDATA: \""+$t.text+"\"")}
            |   t=COMMENT
                {self.output("Comment: \""+$t.text+"\"")}
            |   PI      // unlabelled: the matched text is never read here
            )*
            END_TAG
        |   EMPTY_ELEMENT
        )
    ;
// Opening tag: reports the tag name through output(), then consumes its
// attributes, each of which is reported by ATTRIBUTE, up to the closing bracket.
// The label is read with the $label form, consistent with the ELEMENT rule.
fragment START_TAG
    :   '<' WS? name=GENERIC_ID WS?
        {self.output("Start Tag: "+$name.text)}
        ( ATTRIBUTE WS? )* '>'
    ;
// Self-closing tag: same shape as START_TAG but terminated by a slash and a
// closing bracket. Reports the tag name through output() before the attributes.
// The label is read with the $label form, consistent with the ELEMENT rule.
fragment EMPTY_ELEMENT
    :   '<' WS? name=GENERIC_ID WS?
        {self.output("Empty Element: "+$name.text)}
        ( ATTRIBUTE WS? )* '/>'
    ;
// A name = value pair with optional whitespace around the equals sign.
// The value is reported exactly as matched by VALUE, including its quotes.
// Labels are read with the $label form, consistent with the ELEMENT rule.
fragment ATTRIBUTE
    :   name=GENERIC_ID WS? '=' WS? value=VALUE
        {self.output("Attr: "+$name.text+"="+$value.text)}
    ;
// Closing tag: reports the tag name through output() once the whole tag has
// been matched.
// The label is read with the $label form, consistent with the ELEMENT rule.
fragment END_TAG
    :   '</' WS? name=GENERIC_ID WS? '>'
        {self.output("End Tag: "+$name.text)}
    ;
// Comment: everything from the opening marker up to the first closing marker.
// The wildcard loop is non-greedy.
fragment COMMENT
    :   '<!--'
        ( options {greedy=false;} : . )*
        '-->'
    ;
// CDATA section: everything from the opening marker up to the first closing
// marker. The wildcard loop is non-greedy.
fragment CDATA
    :   '<![CDATA['
        ( options {greedy=false;} : . )*
        ']]>'
    ;
// Character data: one or more characters, none of which is a left angle bracket.
fragment PCDATA
    :   ( ~'<' )+
    ;
// Quoted value: delimited by a pair of double quotes or a pair of single
// quotes. The body may contain anything except the delimiter in use.
// The quotes are part of the matched text.
fragment VALUE
    :   '\"' ( ~'\"' )* '\"'
    |   '\'' ( ~'\'' )* '\''
    ;
// Name used for tags, attributes and processing-instruction targets.
// First character: a letter, underscore or colon.
// Following characters: letters, digits, dot, hyphen, underscore or colon,
// consumed greedily.
fragment GENERIC_ID
    :   ( LETTER | '_' | ':' )
        (   options {greedy=true;}
        :   LETTER
        |   '0'..'9'
        |   '.'
        |   '-'
        |   '_'
        |   ':'
        )*
    ;
// A single ASCII letter, lower or upper case.
fragment LETTER
    :   'a'..'z' | 'A'..'Z'
    ;
// One or more whitespace characters: space, tab, or a line break written as
// LF, CR LF, or a lone CR.
fragment WS
    :   (   ' '
        |   '\t'
        |   '\n'
        |   '\r\n'
        |   '\r'
        )+
    ;