blob: 1737148b6cdaaed5036247679fe63f9fbc9fb33b [file] [log] [blame]
/* This is automatically generated code. Do not modify it. */
package org.intellij.lang.regexp;
import com.intellij.lexer.FlexLexer;
import com.intellij.psi.tree.IElementType;
import java.util.LinkedList;
import java.util.EnumSet;
import com.intellij.psi.StringEscapesTokenTypes;
// IDEADEV-11055
// Inspection suppressions for the lexer class; each inspection id is listed once.
@SuppressWarnings({ "ALL", "SameParameterValue", "WeakerAccess", "SameReturnValue", "RedundantThrows", "UnusedDeclaration" })
%%
/* Name of the generated lexer class. */
%class _RegExLexer
/* The generated class implements the FlexLexer interface imported above. */
%implements FlexLexer
/* Scan Unicode input. */
%unicode
/* The scanning method is called advance() and returns an IElementType. */
%function advance
%type IElementType
/* Code executed when the end of input is reached: nothing to clean up, just return. */
%eof{ return;
%eof}
%{
// Stack of saved lexical states (most recently saved first). It adds support for nested states:
// yypushstate() saves the current state here before switching, yypopstate() restores it.
private final LinkedList<Integer> states = new LinkedList<Integer>();
// This was an idea to use the regex implementation for XML schema regexes (which use a slightly different syntax)
// as well, but is currently unfinished as it requires tweaking more places than just the lexer.
// When set, "\c", "\C", "\i" and "\I" are lexed as character classes.
private boolean xmlSchemaMode;
// When set, an unexpected character inside "{...}" silently ends the EMBRACED state instead of
// producing a BAD_CHARACTER token.
private boolean allowDanglingMetacharacters;
// When set, "[" inside a character class opens a nested class and "&&" is lexed as ANDAND.
private boolean allowNestedCharacterClasses;
// When set, a backslash followed by three octal digits (and a lone "\0") is lexed as OCT_CHAR.
private boolean allowOctalNoLeadingZero;
// When set, "\h" and "\H" are lexed as character classes instead of invalid escapes.
private boolean allowHexDigitClass;
// When set, "[^]" is lexed via the CLASS1PY state (caret, then a literal "]").
private boolean allowEmptyCharacterClass;
/**
 * Creates a lexer with no input reader attached and configures it from {@code capabilities}.
 * Every flag is enabled exactly when the matching capability is present in the set.
 *
 * @param capabilities the dialect features this lexer should support
 */
_RegExLexer(EnumSet<RegExpCapability> capabilities) {
  this((java.io.Reader)null);
  // Initial comment mode; it may be toggled later by inline option groups.
  commentMode = capabilities.contains(RegExpCapability.COMMENT_MODE);
  xmlSchemaMode = capabilities.contains(RegExpCapability.XML_SCHEMA_MODE);
  // Character class related switches.
  allowNestedCharacterClasses = capabilities.contains(RegExpCapability.NESTED_CHARACTER_CLASSES);
  allowEmptyCharacterClass = capabilities.contains(RegExpCapability.ALLOW_EMPTY_CHARACTER_CLASS);
  allowHexDigitClass = capabilities.contains(RegExpCapability.ALLOW_HEX_DIGIT_CLASS);
  // Remaining dialect switches.
  allowOctalNoLeadingZero = capabilities.contains(RegExpCapability.OCTAL_NO_LEADING_ZERO);
  allowDanglingMetacharacters = capabilities.contains(RegExpCapability.DANGLING_METACHARACTERS);
}
/**
 * Saves the current lexical state on the state stack and then switches to {@code state}.
 *
 * @param state the lexical state to enter
 */
private void yypushstate(int state) {
  final int previous = yystate();
  states.addFirst(previous);
  yybegin(state);
}
/**
 * Leaves the current lexical state by switching back to the most recently saved one,
 * which is removed from the state stack.
 */
private void yypopstate() {
  yybegin(states.removeFirst());
}
/**
 * Updates comment mode from the option letters that were just matched.
 * Only text containing the letter 'x' has an effect: comment mode is switched off when the
 * text starts with '-', and switched on otherwise.
 */
private void handleOptions() {
  final String flags = yytext().toString();
  if (flags.indexOf('x') < 0) {
    // No 'x' flag in this run of options: comment mode stays as it is.
    return;
  }
  commentMode = !flags.startsWith("-");
}
// Tracks whether the lexer is in comment mode, i.e. whether whitespace is not significant and whether
// text after '#' up to the end of the line is ignored.
// Initialised from RegExpCapability.COMMENT_MODE in the constructor and toggled by handleOptions()
// whenever a matched option string contains the 'x' flag.
boolean commentMode = false;
%}
/* Lexical states. States declared with %xstate are exclusive: only rules explicitly prefixed with the
   state are active in them. States declared with %state are inclusive: rules without a state prefix
   are active in them as well. */
/* Inside a quoted literal section started by "\Q". */
%xstate QUOTED
/* Inside braces: a counted closure, or the braces following a property escape. */
%xstate EMBRACED
/* Directly after a class opening that is followed by "]": that "]" is a literal character. */
%xstate CLASS1
/* Directly after a class opening that is followed by "^]", when empty classes are allowed. */
%xstate CLASS1PY
/* Inside a character class (inclusive, so the un-prefixed rules also apply). */
%state CLASS2
/* Directly after a property escape such as "\p" (inclusive). */
%state PROP
/* After "(?", reading inline option letters. */
%xstate OPTIONS
/* After "#" in comment mode, reading the rest of the line. */
%xstate COMMENT
/* Reading the name of a named group terminated by ">". */
%xstate NAMED_GROUP
/* Reading the name of a named group terminated by a single quote. */
%xstate QUOTED_NAMED_GROUP
/* Reading the name in a Python named group reference, terminated by ")". */
%xstate PY_NAMED_GROUP_REF
/* Reading the group name or number in a Python conditional reference, terminated by ")". */
%xstate PY_COND_REF
/* Macro definitions used by the rules below. */
/* A decimal number without a leading zero. */
DIGITS=[1-9][0-9]*
/* Single punctuation characters. */
DOT="."
LPAREN="("
RPAREN=")"
LBRACE="{"
RBRACE="}"
LBRACKET="["
RBRACKET="]"
/* A single backslash. */
ESCAPE="\\"
/* Any character, line feed included. */
ANY=.|\n
/* Characters that yield ESC_CHARACTER when preceded by a backslash. Note that "]" and "}" are not included. */
META={ESCAPE} | {DOT} |
"^" | "$" | "?" | "*" | "+" | "|" |
{LBRACKET} | {LBRACE} | {LPAREN} | {RPAREN}
/* Letters that form control character escapes after a backslash (ESC_CTRL_CHARACTER). */
CONTROL="t" | "n" | "r" | "f" | "a" | "e"
/* Letters that form boundary matchers after a backslash (BOUNDARY outside character classes). */
BOUNDARY="b" | "B" | "A" | "z" | "Z" | "G"
/* Letters that form predefined character classes after a backslash (CHAR_CLASS). */
CLASS="w" | "W" | "s" | "S" | "d" | "D" | "X" | "C"
/* Letters that form character classes after a backslash in XML schema mode only. */
XML_CLASS="c" | "C" | "i" | "I"
/* Letters that start a property escape after a backslash. This macro shares its name with the PROP state. */
PROP="p" | "P"
/* A single hexadecimal digit. */
HEX_CHAR=[0-9a-fA-F]
%%
/* Lexical rules. The scanner takes the longest match and, for matches of equal length, the rule
   listed first, so the order of the rules in this section is significant. */
/* Quoting: "\Q" saves the current state and enters QUOTED, where every character is a plain
   CHARACTER until "\E" restores the saved state. */
"\\Q" { yypushstate(QUOTED); return RegExpTT.QUOTE_BEGIN; }
<QUOTED> {
"\\E" { yypopstate(); return RegExpTT.QUOTE_END; }
{ANY} { return RegExpTT.CHARACTER; }
}
/* Escape sequences. None of these rules has a state prefix (except the nested CLASS2 block), so they
   apply in YYINITIAL and in the inclusive states CLASS2 and PROP. */
/* \\ */
{ESCAPE} {ESCAPE} { return RegExpTT.ESC_CHARACTER; }
/* hex escapes: exactly two hex digits are valid; anything else after "\x" (up to two characters)
   is reported as a bad hex value */
{ESCAPE} "x" {HEX_CHAR}{2} { return RegExpTT.HEX_CHAR; }
{ESCAPE} "x" {ANY}{0,2} { return RegExpTT.BAD_HEX_VALUE; }
/* unicode escapes: exactly four hex digits are valid; anything else after "\u" (up to four characters)
   is reported as an invalid unicode escape */
{ESCAPE} "u" {HEX_CHAR}{4} { return RegExpTT.UNICODE_CHAR; }
{ESCAPE} "u" {ANY}{0,4} { return StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN; }
/* octal escapes: "\0" followed by one to three octal digits; a lone "\0" is only accepted
   when allowOctalNoLeadingZero is set */
{ESCAPE} "0" [0-7]{1,3} { return RegExpTT.OCT_CHAR; }
{ESCAPE} "0" { return (allowOctalNoLeadingZero ? RegExpTT.OCT_CHAR : RegExpTT.BAD_OCT_VALUE); }
/* single character after "\c": in XML schema mode the extra character is pushed back and "\c" alone
   becomes a character class; otherwise both characters form one CTRL token */
{ESCAPE} "c" {ANY} { if (xmlSchemaMode) { yypushback(1); return RegExpTT.CHAR_CLASS; } else return RegExpTT.CTRL; }
/* the XML schema class letters are invalid escapes outside XML schema mode */
{ESCAPE} {XML_CLASS} { if (xmlSchemaMode) return RegExpTT.CHAR_CLASS; else return StringEscapesTokenTypes.INVALID_CHARACTER_ESCAPE_TOKEN; }
/* java.util.regex.Pattern says about backrefs:
"In this class, \1 through \9 are always interpreted as back references,
and a larger number is accepted as a back reference if at least that many
subexpressions exist at that point in the regular expression, otherwise the
parser will drop digits until the number is smaller or equal to the existing
number of groups or it is one digit."
So, for 100% compatibility, backrefs > 9 should be resolved by the parser, but
I'm not sure if it's worth the effort - at least not atm.
*/
/* three octal digits: an octal character when allowOctalNoLeadingZero is set; otherwise a
   back reference, or an escaped character when the current state is CLASS2 */
{ESCAPE} [0-7]{3} { if (allowOctalNoLeadingZero) return RegExpTT.OCT_CHAR;
return yystate() != CLASS2 ? RegExpTT.BACKREF : RegExpTT.ESC_CHARACTER;
}
/* any other number without a leading zero: back reference outside CLASS2, escaped character inside */
{ESCAPE} {DIGITS} { return yystate() != CLASS2 ? RegExpTT.BACKREF : RegExpTT.ESC_CHARACTER; }
{ESCAPE} "-" { return RegExpTT.ESC_CHARACTER; }
{ESCAPE} {META} { return RegExpTT.ESC_CHARACTER; }
{ESCAPE} {CLASS} { return RegExpTT.CHAR_CLASS; }
/* property escape: the inclusive PROP state then decides whether a brace follows */
{ESCAPE} {PROP} { yypushstate(PROP); return RegExpTT.PROPERTY; }
/* boundary letters are only boundaries outside CLASS2 */
{ESCAPE} {BOUNDARY} { return yystate() != CLASS2 ? RegExpTT.BOUNDARY : RegExpTT.ESC_CHARACTER; }
{ESCAPE} {CONTROL} { return RegExpTT.ESC_CTRL_CHARACTER; }
/* "\h" and "\H" are character classes only when allowHexDigitClass is set */
{ESCAPE} [hH] { return (allowHexDigitClass ? RegExpTT.CHAR_CLASS : StringEscapesTokenTypes.INVALID_CHARACTER_ESCAPE_TOKEN); }
/* every letter not handled by an earlier rule is an invalid escape */
{ESCAPE} [:letter:] { return StringEscapesTokenTypes.INVALID_CHARACTER_ESCAPE_TOKEN; }
/* escaped whitespace: a plain character in comment mode, a redundant escape otherwise */
{ESCAPE} [\n\b\t\r\f ] { return commentMode ? RegExpTT.CHARACTER : RegExpTT.REDUNDANT_ESCAPE; }
<CLASS2> {
/* escaped "]" inside a character class; listed before the generic rule below so that it wins in CLASS2 */
{ESCAPE} {RBRACKET} { if (!allowNestedCharacterClasses) return RegExpTT.CHARACTER;
return RegExpTT.REDUNDANT_ESCAPE; }
}
/* a backslash before any remaining character is redundant */
{ESCAPE} {ANY} { return RegExpTT.REDUNDANT_ESCAPE; }
/* a backslash with nothing after it */
{ESCAPE} { return StringEscapesTokenTypes.INVALID_CHARACTER_ESCAPE_TOKEN; }
/* Directly after a property escape. An opening brace replaces PROP with EMBRACED on the state stack.
   The fallback rule restores the saved state and pushes the character back without returning a
   token, so that character is scanned again in the restored state. */
<PROP> {
{LBRACE} { yypopstate(); yypushstate(EMBRACED); return RegExpTT.LBRACE; }
{ANY} { yypopstate(); yypushback(1); }
}
/* "{" \d+(,\d*)? "}" */
/* "}" outside counted closure is treated as regular character */
/* An opening brace always yields LBRACE; the EMBRACED state is only entered when the current
   state is not CLASS2. */
{LBRACE} { if (yystate() != CLASS2) yypushstate(EMBRACED); return RegExpTT.LBRACE; }
/* Contents of the braces: names, numbers and commas, until the closing brace restores the saved state. */
<EMBRACED> {
[:letter:]([:letter:]|_|[:digit:])* { return RegExpTT.NAME; }
[:digit:]+ { return RegExpTT.NUMBER; }
"," { return RegExpTT.COMMA; }
{RBRACE} { yypopstate(); return RegExpTT.RBRACE; }
/* Any other character: with allowDanglingMetacharacters the state is left and the character is pushed
   back to be scanned again (no token is returned here); otherwise it is a bad character and the
   lexer stays in EMBRACED. */
{ANY} { if (allowDanglingMetacharacters) {
yypopstate(); yypushback(1);
} else {
return RegExpTT.BAD_CHARACTER;
}
}
}
/* Un-prefixed rules: active in YYINITIAL and in the inclusive states. */
"-" { return RegExpTT.MINUS; }
"^" { return RegExpTT.CARET; }
/* "[" while already inside a character class: it opens a nested class only when
   allowNestedCharacterClasses is set, otherwise it is a plain character. */
<CLASS2> {
{LBRACKET} { if (allowNestedCharacterClasses) {
yypushstate(CLASS2);
return RegExpTT.CLASS_BEGIN;
}
return RegExpTT.CHARACTER;
}
/* Same, for a "[" directly followed by "]": the nested class starts in CLASS1 so that
   the "]" is read as a literal character. */
{LBRACKET} / {RBRACKET} { if (allowNestedCharacterClasses) {
yypushstate(CLASS1);
return RegExpTT.CLASS_BEGIN;
}
return RegExpTT.CHARACTER;
}
}
/* "[" directly followed by "]": enter CLASS1, where that "]" is a literal character. */
{LBRACKET} / {RBRACKET} { yypushstate(CLASS1);
return RegExpTT.CLASS_BEGIN; }
/* Python understands that, Java doesn't */
/* "[" directly followed by "^]": with allowEmptyCharacterClass the class starts in CLASS1PY,
   otherwise it starts as an ordinary class in CLASS2. */
{LBRACKET} / "^" {RBRACKET} { if (allowEmptyCharacterClass) {
yypushstate(CLASS1PY);
}
else {
yypushstate(CLASS2);
}
return RegExpTT.CLASS_BEGIN;
}
/* Any other "[" opens an ordinary character class. */
{LBRACKET} { yypushstate(CLASS2);
return RegExpTT.CLASS_BEGIN; }
/* []abc] is legal. The first ] is treated as literal character */
/* CLASS1 is only entered when the next character is "]", so the fallback rule is not expected to
   fire; its action returns no token. */
<CLASS1> {
{RBRACKET} { yybegin(CLASS2); return RegExpTT.CHARACTER; }
. { assert false : yytext(); }
}
/* CLASS1PY is only entered when "^]" follows: the caret is returned first, then CLASS1 reads
   the "]" as a literal character. */
<CLASS1PY> {
"^" { yybegin(CLASS1); return RegExpTT.CARET; }
. { assert false : yytext(); }
}
/* Body of a character class. "]" closes the class and restores the saved state. */
<CLASS2> {
{RBRACKET} { yypopstate(); return RegExpTT.CLASS_END; }
/* "&&" is an operator only when nested character classes are allowed */
"&&" { return allowNestedCharacterClasses ? RegExpTT.ANDAND : RegExpTT.CHARACTER; }
/* literal control whitespace inside a class: ignorable whitespace in comment mode */
[\n\b\t\r\f] { return commentMode ? com.intellij.psi.TokenType.WHITE_SPACE : RegExpTT.ESC_CHARACTER; }
{ANY} { return RegExpTT.CHARACTER; }
}
/* Rules that apply only at the top level of a pattern: grouping, alternation, quantifiers
   and the various "(?" constructs. */
<YYINITIAL> {
{LPAREN} { return RegExpTT.GROUP_BEGIN; }
{RPAREN} { return RegExpTT.GROUP_END; }
"|" { return RegExpTT.UNION; }
"?" { return RegExpTT.QUEST; }
"*" { return RegExpTT.STAR; }
"+" { return RegExpTT.PLUS; }
"$" { return RegExpTT.DOLLAR; }
{DOT} { return RegExpTT.DOT; }
/* special group openers; being longer matches, they win over the plain "(" rule above */
"(?:"|"(?>" { return RegExpTT.NON_CAPT_GROUP; }
"(?=" { return RegExpTT.POS_LOOKAHEAD; }
"(?!" { return RegExpTT.NEG_LOOKAHEAD; }
"(?<=" { return RegExpTT.POS_LOOKBEHIND; }
"(?<!" { return RegExpTT.NEG_LOOKBEHIND; }
/* inline comment group; at least one character is required between "(?#" and ")" */
"(?#" [^)]+ ")" { return RegExpTT.COMMENT; }
/* the following openers switch to a dedicated state that reads the group name, reference or
   options and then switches back to YYINITIAL */
"(?P<" { yybegin(NAMED_GROUP); return RegExpTT.PYTHON_NAMED_GROUP; }
"(?P=" { yybegin(PY_NAMED_GROUP_REF); return RegExpTT.PYTHON_NAMED_GROUP_REF; }
"(?(" { yybegin(PY_COND_REF); return RegExpTT.PYTHON_COND_REF; }
"(?<" { yybegin(NAMED_GROUP); return RegExpTT.RUBY_NAMED_GROUP; }
"(?'" { yybegin(QUOTED_NAMED_GROUP); return RegExpTT.RUBY_QUOTED_NAMED_GROUP; }
/* any other "(?" starts inline options */
"(?" { yybegin(OPTIONS); return RegExpTT.SET_OPTIONS; }
}
/* Inline options after "(?": a run of letters switches options on, a "-" followed by letters
   switches them off. handleOptions() updates comment mode when the matched text contains 'x'.
   ":", ")" and any unexpected character switch back to YYINITIAL. */
<OPTIONS> {
[:letter:]* { handleOptions(); return RegExpTT.OPTIONS_ON; }
("-" [:letter:]*) { handleOptions(); return RegExpTT.OPTIONS_OFF; }
":" { yybegin(YYINITIAL); return RegExpTT.COLON; }
")" { yybegin(YYINITIAL); return RegExpTT.GROUP_END; }
{ANY} { yybegin(YYINITIAL); return RegExpTT.BAD_CHARACTER; }
}
/* Group name terminated by ">". Any other character is a bad character; both the terminator
   and the bad character switch back to YYINITIAL. */
<NAMED_GROUP> {
[:letter:]([:letter:]|_|[:digit:])* { return RegExpTT.NAME; }
">" { yybegin(YYINITIAL); return RegExpTT.GT; }
{ANY} { yybegin(YYINITIAL); return RegExpTT.BAD_CHARACTER; }
}
/* Group name terminated by a single quote. */
<QUOTED_NAMED_GROUP> {
[:letter:]([:letter:]|_|[:digit:])* { return RegExpTT.NAME; }
"'" { yybegin(YYINITIAL); return RegExpTT.QUOTE; }
{ANY} { yybegin(YYINITIAL); return RegExpTT.BAD_CHARACTER; }
}
/* Name of a Python named group reference, terminated by ")". */
<PY_NAMED_GROUP_REF> {
[:letter:]([:letter:]|_|[:digit:])* { return RegExpTT.NAME; }
")" { yybegin(YYINITIAL); return RegExpTT.GROUP_END; }
{ANY} { yybegin(YYINITIAL); return RegExpTT.BAD_CHARACTER; }
}
/* Group name or group number of a Python conditional reference, terminated by ")". */
<PY_COND_REF> {
[:letter:]([:letter:]|_|[:digit:])* { return RegExpTT.NAME; }
[:digit:]+ { return RegExpTT.NUMBER; }
")" { yybegin(YYINITIAL); return RegExpTT.GROUP_END; }
{ANY} { yybegin(YYINITIAL); return RegExpTT.BAD_CHARACTER; }
}
/* "dangling ]" */
<YYINITIAL> {RBRACKET} { return RegExpTT.CHARACTER; }
/* In comment mode "#" is returned as a COMMENT token and the COMMENT state is entered;
   otherwise it is an ordinary character. */
"#" { if (commentMode) { yypushstate(COMMENT); return RegExpTT.COMMENT; } else return RegExpTT.CHARACTER; }
/* The rest of the line, including at most one line terminator character, is a second COMMENT
   token; afterwards the saved state is restored. */
<COMMENT> {
[^\r\n]*[\r\n]? { yypopstate(); return RegExpTT.COMMENT; }
}
/* Literal whitespace is ignorable in comment mode; otherwise a space is a plain character and
   the control whitespace characters are control characters. */
" " { return commentMode ? com.intellij.psi.TokenType.WHITE_SPACE : RegExpTT.CHARACTER; }
[\n\b\t\r\f] { return commentMode ? com.intellij.psi.TokenType.WHITE_SPACE : RegExpTT.CTRL_CHARACTER; }
/* Everything not matched by an earlier rule is a plain character. */
{ANY} { return RegExpTT.CHARACTER; }