// blob: 1b6c7bc8ff42c8c16dd5f31926e9619b1e85363a [file] [log] [blame]
package org.unicode.cldr.draft;
import java.text.ParseException;
import java.text.ParsePosition;
import java.util.regex.Pattern;
import org.unicode.cldr.util.PatternCache;
/**
* Immutable class that allows people to fix regex pattern strings so that they provide internationalization support
* (as per UTS 18 Unicode Regular Expressions). The chief problem with the internationalization of
* regex engines is that character classes (eg "[\p{alphanum}@#$]") and property support in most
* regex engines are missing properties, have the wrong values for properties,
* have out-of-date values for properties, or are missing the syntax to
* combine properties or character classes (eg to get the assigned characters in the Devanagari block).
* The reason for supporting more than just Java is so that this can be used in build-time tools for generating
* fixed regex pattern strings.
* <p>
* TODO add options for controlling whether to change \w, \b, etc.
* <p>
* TODO be sensitive to COMMENTS
* <p>
* TODO add support for (?#)
*
* @author markdavis
*/
public class PatternFixer {
    /**
     * Regex engine type, will be added to over time: PERL, PYTHON, PCRE, and so on.
     * <p>
     * The reason for supporting more than just Java is so that this can be used in build-time tools for generating
     * fixed regex pattern strings.
     */
    public enum Target {
        JAVA
    }

    /** Scanner states used by {@link #fix(String, int)} while walking the pattern. */
    private enum State {
        /** Ordinary, unquoted pattern text. */
        BASE,
        /** The previous character was an unquoted backslash. */
        HAVE_SLASH,
        /** Inside a quoted section opened by backslash-Q. */
        HAVE_Q,
        /** Inside a quoted section, directly after a backslash (an 'E' would close the section). */
        HAVE_Q_SLASH
    }

    // =============== PRIVATES ========================

    /** Regex engine this fixer produces patterns for; never reassigned, keeping the class immutable. */
    private final Target target;

    /**
     * Create for particular regex target.
     *
     * @param target
     *            the regex engine the fixed patterns are intended for
     */
    public PatternFixer(Target target) {
        this.target = target;
    }

    /**
     * @return the regex target passed to the constructor
     */
    public Target getTarget() {
        return target;
    }

    /**
     * Produce a modified pattern that fixes character classes. (See class description.)
     * <p>
     * The pattern is scanned left to right. Text inside a backslash-Q ... backslash-E quoted section and
     * ordinary escaped characters are copied through unchanged. An unquoted '[' and the escapes
     * backslash-p, backslash-P and backslash-N are handed to the Unicode set parser, which writes their
     * replacement into the result.
     * <p>
     * NOTE: the Unicode set parser is currently a placeholder that always fails, so any pattern containing
     * one of those constructs is rejected with an {@link IllegalArgumentException}.
     *
     * @param regexPattern
     *            the regex pattern to fix
     * @param patternOptions
     *            regex option flags; accepted for future use, the scanner does not consult them yet
     * @return the fixed pattern string
     * @throws IllegalArgumentException
     *             if a character class or property escape cannot be parsed; the cause is the underlying
     *             {@link ParseException}
     * @throws NullPointerException
     *             if {@code regexPattern} is null
     */
    public String fix(String regexPattern, int patternOptions) {
        // TODO optimize
        // TODO handle (?#), #, ...
        UnicodeSetBuilder builder = new UnicodeSetBuilder(); // target, patternOptions
        ParsePosition parsePosition = new ParsePosition(0);
        StringBuffer result = new StringBuffer();
        State state = State.BASE;
        try {
            for (int i = 0; i < regexPattern.length(); ++i) {
                char ch = regexPattern.charAt(i);
                switch (state) {
                case BASE:
                    if (ch == '[') {
                        // The parser appends the replacement set itself; leave i on the last consumed
                        // character so that the loop's ++i moves to the first unconsumed one.
                        i = parseSetOrFail(regexPattern, builder, parsePosition, result, i) - 1;
                        continue;
                    }
                    if (ch == '\\') {
                        state = State.HAVE_SLASH;
                    }
                    break;
                case HAVE_SLASH:
                    switch (ch) {
                    case 'p':
                    case 'P':
                    case 'N':
                        // The introducing backslash was already copied to the result. It belongs to the
                        // set expression, so drop it (otherwise it would escape the replacement) and
                        // let the parser start from the backslash.
                        result.setLength(result.length() - 1);
                        // The escape is complete once the set has been parsed.
                        state = State.BASE;
                        i = parseSetOrFail(regexPattern, builder, parsePosition, result, i - 1) - 1;
                        continue;
                    case 'Q':
                        state = State.HAVE_Q;
                        break;
                    default:
                        // Any other escaped character is copied through literally.
                        state = State.BASE;
                        break;
                    }
                    break;
                case HAVE_Q:
                    if (ch == '\\') {
                        state = State.HAVE_Q_SLASH;
                    }
                    break;
                case HAVE_Q_SLASH:
                    if (ch == 'E') {
                        state = State.BASE;
                    } else if (ch != '\\') {
                        // A second backslash may itself start the closing backslash-E, so only a
                        // non-backslash character returns to plain quoted text.
                        state = State.HAVE_Q;
                    }
                    break;
                }
                result.append(ch);
            }
        } catch (ParseException e) {
            throw new IllegalArgumentException(
                "Cannot fix regex pattern at offset " + e.getErrorOffset() + ": " + regexPattern, e);
        }
        return result.toString();
    }

    /**
     * Same as {@link #fix(String, int)} with no pattern options (0).
     *
     * @param regexPattern
     *            the regex pattern to fix
     * @return the fixed pattern string
     */
    public String fix(String regexPattern) {
        return fix(regexPattern, 0);
    }

    // convenience functions

    /**
     * Fixes the pattern for the Java target and compiles it with the given flags.
     *
     * @param regexPattern
     *            the regex pattern to fix and compile
     * @param patternOptions
     *            flags passed to {@link Pattern#compile(String, int)}
     * @return the compiled pattern
     */
    public static Pattern compile(String regexPattern, int patternOptions) {
        return Pattern.compile(new PatternFixer(Target.JAVA).fix(regexPattern, patternOptions), patternOptions);
    }

    /**
     * Fixes the pattern for the Java target and obtains the compiled pattern from {@link PatternCache}.
     *
     * @param regexPattern
     *            the regex pattern to fix and compile
     * @return the compiled pattern
     */
    public static Pattern compile(String regexPattern) {
        return PatternCache.get(new PatternFixer(Target.JAVA).fix(regexPattern));
    }

    /**
     * Fixes the pattern for the Java target without compiling it.
     *
     * @param regexPattern
     *            the regex pattern to fix
     * @param patternOptions
     *            regex option flags, passed on to {@link #fix(String, int)}
     * @return the fixed pattern string
     */
    public static String fixJava(String regexPattern, int patternOptions) {
        return new PatternFixer(Target.JAVA).fix(regexPattern, patternOptions);
    }

    /**
     * Fixes the pattern for the Java target without compiling it, using no pattern options.
     *
     * @param regexPattern
     *            the regex pattern to fix
     * @return the fixed pattern string
     */
    public static String fixJava(String regexPattern) {
        return new PatternFixer(Target.JAVA).fix(regexPattern);
    }

    /**
     * Runs the Unicode set parser at {@code start} and insists that it consumed input, so that the scanning
     * loop in {@link #fix(String, int)} can never stall or move backwards.
     *
     * @return the index of the first character after the parsed set, always greater than {@code start}
     * @throws ParseException
     *             if the parser fails or reports no progress
     */
    private int parseSetOrFail(String regexPattern, UnicodeSetBuilder builder,
        ParsePosition parsePosition, StringBuffer result, int start) throws ParseException {
        int next = parseUnicodeSet(regexPattern, builder, parsePosition, result, start);
        if (next <= start) {
            throw new ParseException(regexPattern, start);
        }
        return next;
    }

    /**
     * Parses the Unicode set expression starting at index {@code i} of {@code regexPattern} and appends its
     * replacement to {@code result}.
     * <p>
     * Placeholder: the real implementation (kept below for reference) is disabled, so this method always
     * reports the set as unparseable. It previously returned 0, which sent the caller's scan index back to
     * the start of the pattern and made it loop forever.
     *
     * @return the index of the first character after the parsed set (once implemented)
     * @throws ParseException
     *             always, until the implementation below is enabled
     */
    private int parseUnicodeSet(String regexPattern, UnicodeSetBuilder builder,
        ParsePosition parsePosition, StringBuffer result, int i) throws ParseException {
        throw new ParseException("Unicode set parsing is not implemented; cannot fix: " + regexPattern, i);
        // UnicodeSet set;
        // parsePosition.setIndex(i);
        // set = builder.parseObject(regexPattern, parsePosition);
        // if (parsePosition.getIndex() == i) {
        // throw new ParseException(regexPattern, i);
        // }
        // builder.format(set,result,null);
        // return parsePosition.getIndex();
    }
}