| /* |
| * Permission is hereby granted, free of charge, to any person obtaining a copy of |
| * this software and associated documentation files (the "Software"), to deal in |
| * the Software without restriction, including without limitation the rights to |
| * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies |
| * of the Software, and to permit persons to whom the Software is furnished to do |
| * so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| package jdk.nashorn.internal.runtime.regexp.joni; |
| |
| import java.util.Arrays; |
| import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType; |
| import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder; |
| |
| @SuppressWarnings("javadoc") |
| public final class EncodingHelper { |
| |
| final static int NEW_LINE = 0x000a; |
| final static int RETURN = 0x000d; |
| final static int LINE_SEPARATOR = 0x2028; |
| final static int PARAGRAPH_SEPARATOR = 0x2029; |
| |
| final static char[] EMPTYCHARS = new char[0]; |
| final static int[][] codeRanges = new int[15][]; |
| |
| public static int digitVal(final int code) { |
| return code - '0'; |
| } |
| |
| public static int odigitVal(final int code) { |
| return digitVal(code); |
| } |
| |
| public static boolean isXDigit(final int code) { |
| return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F'); |
| } |
| |
| public static int xdigitVal(final int code) { |
| if (Character.isDigit(code)) { |
| return code - '0'; |
| } else if (code >= 'a' && code <= 'f') { |
| return code - 'a' + 10; |
| } else { |
| return code - 'A' + 10; |
| } |
| } |
| |
| public static boolean isDigit(final int code) { |
| return code >= '0' && code <= '9'; |
| } |
| |
| public static boolean isWord(final int code) { |
| // letter, digit, or '_' |
| return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; |
| } |
| |
| public static boolean isNewLine(final int code) { |
| return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR; |
| } |
| |
| public static boolean isNewLine(final char[] chars, final int p, final int end) { |
| return p < end && isNewLine(chars[p]); |
| } |
| |
| // Encoding.prevCharHead |
| public static int prevCharHead(final int p, final int s) { |
| return s <= p ? -1 : s - 1; |
| } |
| |
| /* onigenc_get_right_adjust_char_head_with_prev */ |
| public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) { |
| if (prev != null) { |
| prev.value = -1; /* Sorry */ |
| } |
| return s; |
| } |
| |
| // Encoding.stepBack |
| public static int stepBack(final int p, final int sp, final int np) { |
| int s = sp, n = np; |
| while (s != -1 && n-- > 0) { |
| if (s <= p) { |
| return -1; |
| } |
| s--; |
| } |
| return s; |
| } |
| |
| public static int mbcodeStartPosition() { |
| return 0x80; |
| } |
| |
| public static char[] caseFoldCodesByString(final int flag, final char c) { |
| char[] codes = EMPTYCHARS; |
| final char upper = toUpperCase(c); |
| |
| if (upper != toLowerCase(upper)) { |
| int count = 0; |
| char ch = 0; |
| |
| do { |
| final char u = toUpperCase(ch); |
| if (u == upper && ch != c) { |
| // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine. |
| codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1); |
| codes[count++] = ch; |
| } |
| } while (ch++ < 0xffff); |
| } |
| return codes; |
| } |
| |
| public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) { |
| for (int c = 0; c < 0xffff; c++) { |
| if (Character.isLowerCase(c)) { |
| final int upper = toUpperCase(c); |
| |
| if (upper != c) { |
| ApplyCaseFold.apply(c, upper, arg); |
| } |
| } |
| } |
| |
| // Some characters have multiple lower case variants, hence we need to do a second run |
| for (int c = 0; c < 0xffff; c++) { |
| if (Character.isLowerCase(c)) { |
| final int upper = toUpperCase(c); |
| |
| if (upper != c) { |
| ApplyCaseFold.apply(upper, c, arg); |
| } |
| } |
| } |
| } |
| |
| public static char toLowerCase(final char c) { |
| return (char)toLowerCase((int)c); |
| } |
| |
| public static int toLowerCase(final int c) { |
| if (c < 128) { |
| return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c; |
| } |
| // Do not convert non-ASCII upper case character to ASCII lower case. |
| final int lower = Character.toLowerCase(c); |
| return (lower < 128) ? c : lower; |
| |
| } |
| |
| public static char toUpperCase(final char c) { |
| return (char)toUpperCase((int)c); |
| } |
| |
| public static int toUpperCase(final int c) { |
| if (c < 128) { |
| return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c; |
| } |
| // Do not convert non-ASCII lower case character to ASCII upper case. |
| final int upper = Character.toUpperCase(c); |
| return (upper < 128) ? c : upper; |
| } |
| |
| public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) { |
| sbOut.value = 0x100; // use bitset for codes smaller than 256 |
| int[] range = null; |
| |
| if (ctype < codeRanges.length) { |
| range = codeRanges[ctype]; |
| |
| if (range == null) { |
| // format: [numberOfRanges, rangeStart, rangeEnd, ...] |
| range = new int[16]; |
| int rangeCount = 0; |
| int lastCode = -2; |
| |
| for (int code = 0; code <= 0xffff; code++) { |
| if (isCodeCType(code, ctype)) { |
| if (lastCode < code -1) { |
| if (rangeCount * 2 + 2 >= range.length) { |
| range = Arrays.copyOf(range, range.length * 2); |
| } |
| range[rangeCount * 2 + 1] = code; |
| rangeCount++; |
| } |
| range[rangeCount * 2] = lastCode = code; |
| } |
| } |
| |
| if (rangeCount * 2 + 1 < range.length) { |
| range = Arrays.copyOf(range, rangeCount * 2 + 1); |
| } |
| |
| range[0] = rangeCount; |
| codeRanges[ctype] = range; |
| } |
| } |
| |
| return range; |
| } |
| |
| // CodeRange.isInCodeRange |
| public static boolean isInCodeRange(final int[] p, final int offset, final int code) { |
| int low = 0; |
| final int n = p[offset]; |
| int high = n ; |
| |
| while (low < high) { |
| final int x = (low + high) >> 1; |
| if (code > p[(x << 1) + 2 + offset]) { |
| low = x + 1; |
| } else { |
| high = x; |
| } |
| } |
| return low < n && code >= p[(low << 1) + 1 + offset]; |
| } |
| |
| /** |
| * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a> |
| * |
| * @param code code |
| * @param ctype ctype |
| * |
| * @return isCodeCType |
| */ |
| public static boolean isCodeCType(final int code, final int ctype) { |
| int type; |
| switch (ctype) { |
| case CharacterType.NEWLINE: |
| return isNewLine(code); |
| case CharacterType.ALPHA: |
| return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0; |
| case CharacterType.BLANK: |
| return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR; |
| case CharacterType.CNTRL: |
| type = Character.getType(code); |
| return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED; |
| case CharacterType.DIGIT: |
| return EncodingHelper.isDigit(code); |
| case CharacterType.GRAPH: |
| switch (code) { |
| case 0x09: |
| case 0x0a: |
| case 0x0b: |
| case 0x0c: |
| case 0x0d: |
| return false; |
| default: |
| type = Character.getType(code); |
| return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED; |
| } |
| case CharacterType.LOWER: |
| return Character.isLowerCase(code); |
| case CharacterType.PRINT: |
| type = Character.getType(code); |
| return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED; |
| case CharacterType.PUNCT: |
| return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0; |
| case CharacterType.SPACE: |
| // ECMA 7.2 and 7.3 |
| switch (code) { |
| case 0x09: |
| case 0x0a: |
| case 0x0b: |
| case 0x0c: |
| case 0x0d: |
| return true; |
| default: |
| // true if Unicode separator or BOM or U+180E (see JDK-8138758) |
| return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 |
| || code == 0xfeff || code == 0x180e; |
| } |
| case CharacterType.UPPER: |
| return Character.isUpperCase(code); |
| case CharacterType.XDIGIT: |
| return EncodingHelper.isXDigit(code); |
| case CharacterType.WORD: |
| return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0; |
| case CharacterType.ALNUM: |
| return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0; |
| case CharacterType.ASCII: |
| return code < 0x80; |
| default: |
| throw new RuntimeException("illegal character type: " + ctype); |
| } |
| } |
| } |
| |