blob: d194e2d9f6784c49cccb816dadef680375af53db [file] [log] [blame]
/*
* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.nashorn.internal.runtime.regexp;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.PatternSyntaxException;
import jdk.nashorn.internal.parser.Lexer;
import jdk.nashorn.internal.parser.Scanner;
import jdk.nashorn.internal.runtime.BitVector;
/**
* Scan a JavaScript regexp, converting to Java regex if necessary.
*
*/
final class RegExpScanner extends Scanner {
/**
* String builder used to rewrite the pattern for the currently used regexp factory.
*/
private final StringBuilder sb;
/** Expected token table */
private final Map<Character, Integer> expected = new HashMap<>();
/** Capturing parenthesis that have been found so far. */
private final List<Capture> caps = new LinkedList<>();
/** Forward references to capturing parenthesis to be resolved later.*/
private final LinkedList<Integer> forwardReferences = new LinkedList<>();
/** Current level of zero-width negative lookahead assertions. */
private int negLookaheadLevel;
/** Sequential id of current top-level zero-width negative lookahead assertion. */
private int negLookaheadGroup;
/** Are we currently inside a character class? */
private boolean inCharClass = false;
/** Are we currently inside a negated character class? */
private boolean inNegativeClass = false;
private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-";
private static class Capture {
/** Zero-width negative lookaheads enclosing the capture. */
private final int negLookaheadLevel;
/** Sequential id of top-level negative lookaheads containing the capture. */
private final int negLookaheadGroup;
Capture(final int negLookaheadGroup, final int negLookaheadLevel) {
this.negLookaheadGroup = negLookaheadGroup;
this.negLookaheadLevel = negLookaheadLevel;
}
/**
* Returns true if this Capture can be referenced from the position specified by the
* group and level parameters. This is the case if either the group is not within
* a negative lookahead, or the position of the referrer is in the same negative lookahead.
*
* @param group current negative lookahead group
* @param level current negative lokahead level
* @return true if this capture group can be referenced from the given position
*/
boolean canBeReferencedFrom(final int group, final int level) {
return this.negLookaheadLevel == 0 || (group == this.negLookaheadGroup && level >= this.negLookaheadLevel);
}
}
/**
* Constructor
* @param string the JavaScript regexp to parse
*/
private RegExpScanner(final String string) {
super(string);
sb = new StringBuilder(limit);
reset(0);
expected.put(']', 0);
expected.put('}', 0);
}
private void processForwardReferences() {
final Iterator<Integer> iterator = forwardReferences.descendingIterator();
while (iterator.hasNext()) {
final int pos = iterator.next();
final int num = iterator.next();
if (num > caps.size()) {
// Non-existing backreference. If the number begins with a valid octal convert it to
// Unicode escape and append the rest to a literal character sequence.
final StringBuilder buffer = new StringBuilder();
octalOrLiteral(Integer.toString(num), buffer);
sb.insert(pos, buffer);
}
}
forwardReferences.clear();
}
/**
* Scan a JavaScript regexp string returning a Java safe regex string.
*
* @param string
* JavaScript regexp string.
* @return Java safe regex string.
*/
public static RegExpScanner scan(final String string) {
final RegExpScanner scanner = new RegExpScanner(string);
try {
scanner.disjunction();
} catch (final Exception e) {
throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
}
scanner.processForwardReferences();
// Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
if (scanner.position != string.length()) {
final String p = scanner.getStringBuilder().toString();
throw new PatternSyntaxException(string, p, p.length() + 1);
}
return scanner;
}
final StringBuilder getStringBuilder() {
return sb;
}
String getJavaPattern() {
return sb.toString();
}
BitVector getGroupsInNegativeLookahead() {
BitVector vec = null;
for (int i = 0; i < caps.size(); i++) {
final Capture cap = caps.get(i);
if (cap.negLookaheadLevel > 0) {
if (vec == null) {
vec = new BitVector(caps.size() + 1);
}
vec.set(i + 1);
}
}
return vec;
}
/**
* Commit n characters to the builder and to a given token
* @param n Number of characters.
* @return Committed token
*/
private boolean commit(final int n) {
switch (n) {
case 1:
sb.append(ch0);
skip(1);
break;
case 2:
sb.append(ch0);
sb.append(ch1);
skip(2);
break;
case 3:
sb.append(ch0);
sb.append(ch1);
sb.append(ch2);
skip(3);
break;
default:
assert false : "Should not reach here";
}
return true;
}
/**
* Restart the buffers back at an earlier position.
*
* @param startIn
* Position in the input stream.
* @param startOut
* Position in the output stream.
*/
private void restart(final int startIn, final int startOut) {
reset(startIn);
sb.setLength(startOut);
}
private void push(final char ch) {
expected.put(ch, expected.get(ch) + 1);
}
private void pop(final char ch) {
expected.put(ch, Math.min(0, expected.get(ch) - 1));
}
/*
* Recursive descent tokenizer starts below.
*/
/*
* Disjunction ::
* Alternative
* Alternative | Disjunction
*/
private void disjunction() {
while (true) {
alternative();
if (ch0 == '|') {
commit(1);
} else {
break;
}
}
}
/*
* Alternative ::
* [empty]
* Alternative Term
*/
private void alternative() {
while (term()) {
// do nothing
}
}
/*
* Term ::
* Assertion
* Atom
* Atom Quantifier
*/
private boolean term() {
final int startIn = position;
final int startOut = sb.length();
if (assertion()) {
return true;
}
if (atom()) {
quantifier();
return true;
}
restart(startIn, startOut);
return false;
}
/*
* Assertion ::
* ^
* $
* \b
* \B
* ( ? = Disjunction )
* ( ? ! Disjunction )
*/
private boolean assertion() {
final int startIn = position;
final int startOut = sb.length();
switch (ch0) {
case '^':
case '$':
return commit(1);
case '\\':
if (ch1 == 'b' || ch1 == 'B') {
return commit(2);
}
break;
case '(':
if (ch1 != '?') {
break;
}
if (ch2 != '=' && ch2 != '!') {
break;
}
final boolean isNegativeLookahead = (ch2 == '!');
commit(3);
if (isNegativeLookahead) {
if (negLookaheadLevel == 0) {
negLookaheadGroup++;
}
negLookaheadLevel++;
}
disjunction();
if (isNegativeLookahead) {
negLookaheadLevel--;
}
if (ch0 == ')') {
return commit(1);
}
break;
default:
break;
}
restart(startIn, startOut);
return false;
}
/*
* Quantifier ::
* QuantifierPrefix
* QuantifierPrefix ?
*/
private boolean quantifier() {
if (quantifierPrefix()) {
if (ch0 == '?') {
commit(1);
}
return true;
}
return false;
}
/*
* QuantifierPrefix ::
* *
* +
* ?
* { DecimalDigits }
* { DecimalDigits , }
* { DecimalDigits , DecimalDigits }
*/
private boolean quantifierPrefix() {
final int startIn = position;
final int startOut = sb.length();
switch (ch0) {
case '*':
case '+':
case '?':
return commit(1);
case '{':
commit(1);
if (!decimalDigits()) {
break; // not a quantifier - back out
}
push('}');
if (ch0 == ',') {
commit(1);
decimalDigits();
}
if (ch0 == '}') {
pop('}');
commit(1);
} else {
// Bad quantifier should be rejected but is accepted by all major engines
restart(startIn, startOut);
return false;
}
return true;
default:
break;
}
restart(startIn, startOut);
return false;
}
/*
* Atom ::
* PatternCharacter
* .
* \ AtomEscape
* CharacterClass
* ( Disjunction )
* ( ? : Disjunction )
*
*/
private boolean atom() {
final int startIn = position;
final int startOut = sb.length();
if (patternCharacter()) {
return true;
}
if (ch0 == '.') {
return commit(1);
}
if (ch0 == '\\') {
commit(1);
if (atomEscape()) {
return true;
}
}
if (characterClass()) {
return true;
}
if (ch0 == '(') {
commit(1);
if (ch0 == '?' && ch1 == ':') {
commit(2);
} else {
caps.add(new Capture(negLookaheadGroup, negLookaheadLevel));
}
disjunction();
if (ch0 == ')') {
commit(1);
return true;
}
}
restart(startIn, startOut);
return false;
}
/*
* PatternCharacter ::
* SourceCharacter but not any of: ^$\.*+?()[]{}|
*/
@SuppressWarnings("fallthrough")
private boolean patternCharacter() {
if (atEOF()) {
return false;
}
switch (ch0) {
case '^':
case '$':
case '\\':
case '.':
case '*':
case '+':
case '?':
case '(':
case ')':
case '[':
case '|':
return false;
case '}':
case ']':
final int n = expected.get(ch0);
if (n != 0) {
return false;
}
case '{':
// if not a valid quantifier escape curly brace to match itself
// this ensures compatibility with other JS implementations
if (!quantifierPrefix()) {
sb.append('\\');
return commit(1);
}
return false;
default:
return commit(1); // SOURCECHARACTER
}
}
/*
* AtomEscape ::
* DecimalEscape
* CharacterEscape
* CharacterClassEscape
*/
private boolean atomEscape() {
// Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
}
/*
* CharacterEscape ::
* ControlEscape
* c ControlLetter
* HexEscapeSequence
* UnicodeEscapeSequence
* IdentityEscape
*/
private boolean characterEscape() {
final int startIn = position;
final int startOut = sb.length();
if (controlEscape()) {
return true;
}
if (ch0 == 'c') {
commit(1);
if (controlLetter()) {
return true;
}
restart(startIn, startOut);
}
if (hexEscapeSequence() || unicodeEscapeSequence()) {
return true;
}
restart(startIn, startOut);
return false;
}
private boolean scanEscapeSequence(final char leader, final int length) {
final int startIn = position;
final int startOut = sb.length();
if (ch0 != leader) {
return false;
}
commit(1);
for (int i = 0; i < length; i++) {
final char ch0l = Character.toLowerCase(ch0);
if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
commit(1);
} else {
restart(startIn, startOut);
return false;
}
}
return true;
}
private boolean hexEscapeSequence() {
return scanEscapeSequence('x', 2);
}
private boolean unicodeEscapeSequence() {
return scanEscapeSequence('u', 4);
}
/*
* ControlEscape ::
* one of fnrtv
*/
private boolean controlEscape() {
switch (ch0) {
case 'f':
case 'n':
case 'r':
case 't':
case 'v':
return commit(1);
default:
return false;
}
}
/*
* ControlLetter ::
* one of abcdefghijklmnopqrstuvwxyz
* ABCDEFGHIJKLMNOPQRSTUVWXYZ
*/
private boolean controlLetter() {
// To match other engines we also accept '0'..'9' and '_' as control letters inside a character class.
if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z')
|| (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) {
// for some reason java regexps don't like control characters on the
// form "\\ca".match([string with ascii 1 at char0]). Translating
// them to unicode does it though.
sb.setLength(sb.length() - 1);
unicode(ch0 % 32, sb);
skip(1);
return true;
}
return false;
}
/*
* IdentityEscape ::
* SourceCharacter but not IdentifierPart
* <ZWJ> (200c)
* <ZWNJ> (200d)
*/
private boolean identityEscape() {
if (atEOF()) {
throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
}
// ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
if (ch0 == 'c') {
sb.append('\\'); // Treat invalid \c control sequence as \\c
} else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
sb.setLength(sb.length() - 1);
}
return commit(1);
}
/*
* DecimalEscape ::
* DecimalIntegerLiteral [lookahead DecimalDigit]
*/
private boolean decimalEscape() {
final int startIn = position;
final int startOut = sb.length();
if (ch0 == '0' && !isOctalDigit(ch1)) {
skip(1);
// DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
sb.append("\u0000");
return true;
}
if (isDecimalDigit(ch0)) {
if (ch0 == '0') {
// We know this is an octal escape.
if (inCharClass) {
// Convert octal escape to unicode escape if inside character class.
int octalValue = 0;
while (isOctalDigit(ch0)) {
octalValue = octalValue * 8 + ch0 - '0';
skip(1);
}
unicode(octalValue, sb);
} else {
// Copy decimal escape as-is
decimalDigits();
}
} else {
// This should be a backreference, but could also be an octal escape or even a literal string.
int decimalValue = 0;
while (isDecimalDigit(ch0)) {
decimalValue = decimalValue * 10 + ch0 - '0';
skip(1);
}
if (inCharClass) {
// No backreferences in character classes. Encode as unicode escape or literal char sequence
sb.setLength(sb.length() - 1);
octalOrLiteral(Integer.toString(decimalValue), sb);
} else if (decimalValue <= caps.size()) {
// Captures inside a negative lookahead are undefined when referenced from the outside.
final Capture capture = caps.get(decimalValue - 1);
if (!capture.canBeReferencedFrom(negLookaheadGroup, negLookaheadLevel)) {
// Outside reference to capture in negative lookahead, omit from output buffer.
sb.setLength(sb.length() - 1);
} else {
// Append backreference to output buffer.
sb.append(decimalValue);
}
} else {
// Forward references to a capture group are always undefined so we can omit it from the output buffer.
// However, if the target capture does not exist, we need to rewrite the reference as hex escape
// or literal string, so register the reference for later processing.
sb.setLength(sb.length() - 1);
forwardReferences.add(decimalValue);
forwardReferences.add(sb.length());
}
}
return true;
}
restart(startIn, startOut);
return false;
}
/*
* CharacterClassEscape ::
* one of dDsSwW
*/
private boolean characterClassEscape() {
switch (ch0) {
// java.util.regex requires translation of \s and \S to explicit character list
case 's':
if (RegExpFactory.usesJavaUtilRegex()) {
sb.setLength(sb.length() - 1);
// No nested class required if we already are inside a character class
if (inCharClass) {
sb.append(Lexer.getWhitespaceRegExp());
} else {
sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
}
skip(1);
return true;
}
return commit(1);
case 'S':
if (RegExpFactory.usesJavaUtilRegex()) {
sb.setLength(sb.length() - 1);
// In negative class we must use intersection to get double negation ("not anything else than space")
sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
skip(1);
return true;
}
return commit(1);
case 'd':
case 'D':
case 'w':
case 'W':
return commit(1);
default:
return false;
}
}
/*
* CharacterClass ::
* [ [lookahead {^}] ClassRanges ]
* [ ^ ClassRanges ]
*/
private boolean characterClass() {
final int startIn = position;
final int startOut = sb.length();
if (ch0 == '[') {
try {
inCharClass = true;
push(']');
commit(1);
if (ch0 == '^') {
inNegativeClass = true;
commit(1);
}
if (classRanges() && ch0 == ']') {
pop(']');
commit(1);
// Substitute empty character classes [] and [^] that never or always match
if (position == startIn + 2) {
sb.setLength(sb.length() - 1);
sb.append("^\\s\\S]");
} else if (position == startIn + 3 && inNegativeClass) {
sb.setLength(sb.length() - 2);
sb.append("\\s\\S]");
}
return true;
}
} finally {
inCharClass = false; // no nested character classes in JavaScript
inNegativeClass = false;
}
}
restart(startIn, startOut);
return false;
}
/*
* ClassRanges ::
* [empty]
* NonemptyClassRanges
*/
private boolean classRanges() {
nonemptyClassRanges();
return true;
}
/*
* NonemptyClassRanges ::
* ClassAtom
* ClassAtom NonemptyClassRangesNoDash
* ClassAtom - ClassAtom ClassRanges
*/
private boolean nonemptyClassRanges() {
final int startIn = position;
final int startOut = sb.length();
if (classAtom()) {
if (ch0 == '-') {
commit(1);
if (classAtom() && classRanges()) {
return true;
}
}
nonemptyClassRangesNoDash();
return true;
}
restart(startIn, startOut);
return false;
}
/*
* NonemptyClassRangesNoDash ::
* ClassAtom
* ClassAtomNoDash NonemptyClassRangesNoDash
* ClassAtomNoDash - ClassAtom ClassRanges
*/
private boolean nonemptyClassRangesNoDash() {
final int startIn = position;
final int startOut = sb.length();
if (classAtomNoDash()) {
// need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
if (ch0 == '-') {
commit(1);
if (classAtom() && classRanges()) {
return true;
}
//fallthru
}
nonemptyClassRangesNoDash();
return true; // still a class atom
}
if (classAtom()) {
return true;
}
restart(startIn, startOut);
return false;
}
/*
* ClassAtom : - ClassAtomNoDash
*/
private boolean classAtom() {
if (ch0 == '-') {
return commit(1);
}
return classAtomNoDash();
}
/*
* ClassAtomNoDash ::
* SourceCharacter but not one of \ or ] or -
* \ ClassEscape
*/
private boolean classAtomNoDash() {
if (atEOF()) {
return false;
}
final int startIn = position;
final int startOut = sb.length();
switch (ch0) {
case ']':
case '-':
return false;
case '[':
// unescaped left square bracket - add escape
sb.append('\\');
return commit(1);
case '\\':
commit(1);
if (classEscape()) {
return true;
}
restart(startIn, startOut);
return false;
default:
return commit(1);
}
}
/*
* ClassEscape ::
* DecimalEscape
* b
* CharacterEscape
* CharacterClassEscape
*/
private boolean classEscape() {
if (decimalEscape()) {
return true;
}
if (ch0 == 'b') {
sb.setLength(sb.length() - 1);
sb.append('\b');
skip(1);
return true;
}
// Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
return characterEscape() || characterClassEscape() || identityEscape();
}
/*
* DecimalDigits
*/
private boolean decimalDigits() {
if (!isDecimalDigit(ch0)) {
return false;
}
while (isDecimalDigit(ch0)) {
commit(1);
}
return true;
}
private static void unicode(final int value, final StringBuilder buffer) {
final String hex = Integer.toHexString(value);
buffer.append('u');
for (int i = 0; i < 4 - hex.length(); i++) {
buffer.append('0');
}
buffer.append(hex);
}
// Convert what would have been a backreference into a unicode escape, or a number literal, or both.
private static void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
final int length = numberLiteral.length();
int octalValue = 0;
int pos = 0;
// Maximum value for octal escape is 0377 (255) so we stop the loop at 32
while (pos < length && octalValue < 0x20) {
final char ch = numberLiteral.charAt(pos);
if (isOctalDigit(ch)) {
octalValue = octalValue * 8 + ch - '0';
} else {
break;
}
pos++;
}
if (octalValue > 0) {
buffer.append('\\');
unicode(octalValue, buffer);
buffer.append(numberLiteral.substring(pos));
} else {
buffer.append(numberLiteral);
}
}
private static boolean isOctalDigit(final char ch) {
return ch >= '0' && ch <= '7';
}
private static boolean isDecimalDigit(final char ch) {
return ch >= '0' && ch <= '9';
}
}