nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/EncodingHelper.java - platform/libcore - Git at Google

 /*
  * Permission is hereby granted, free of charge, to any person obtaining a copy of
  * this software and associated documentation files (the "Software"), to deal in
  * the Software without restriction, including without limitation the rights to
  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  * of the Software, and to permit persons to whom the Software is furnished to do
  * so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 package jdk.nashorn.internal.runtime.regexp.joni;

 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
 import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;

 import java.util.Arrays;

 /**
  * Static character-classification and position helpers for the joni regexp
  * package, working on {@code char} / {@code char[]} input and on code values
  * in the range {@code 0..0xffff}.
  *
  * <p>All methods are static. The only mutable state is {@link #codeRanges},
  * a cache of code-range tables that is filled on first use.</p>
  */
 public class EncodingHelper {

     /** Line feed, U+000A. */
     public static final char NEW_LINE = 0xa;
     /** Carriage return, U+000D. */
     public static final char RETURN   = 0xd;

     /** Shared empty array returned when a character has no case-fold partner. */
     static final char[] EMPTYCHARS = new char[0];
     /** Cache of code-range tables, indexed by character type and filled on first use. */
     static final int[][] codeRanges = new int[15][];

     /**
      * Returns the numeric value of a decimal digit.
      *
      * @param code a code value; the result is only meaningful for {@code '0'..'9'}
      * @return {@code code - '0'}
      */
     public static int digitVal(int code) {
         return code - '0';
     }

     /**
      * Returns the numeric value of an octal digit. Delegates to
      * {@link #digitVal(int)}; no range check is performed.
      *
      * @param code a code value
      * @return {@code code - '0'}
      */
     public static int odigitVal(int code) {
         return digitVal(code);
     }

     /**
      * Tests whether {@code code} is an ASCII hexadecimal digit
      * ({@code 0-9}, {@code a-f} or {@code A-F}).
      *
      * <p>Only ASCII digits are accepted. {@link Character#isDigit(int)} also
      * admits non-ASCII decimal digits (for example U+0660), for which
      * {@link #xdigitVal(int)} cannot produce a value in {@code 0..15}.</p>
      *
      * @param code a code value
      * @return {@code true} for an ASCII hexadecimal digit
      */
     public static boolean isXDigit(int code) {
         return isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
     }

     /**
      * Returns the numeric value of a hexadecimal digit.
      *
      * @param code a code value for which {@link #isXDigit(int)} is {@code true};
      *             any other input yields an unspecified number
      * @return a value in {@code 0..15} for valid input
      */
     public static int xdigitVal(int code) {
         if (isDigit(code)) {
             return code - '0';
         } else if (code >= 'a' && code <= 'f') {
             return code - 'a' + 10;
         } else {
             // Not a digit and not a-f: treated as A-F without further checking.
             return code - 'A' + 10;
         }
     }

     /**
      * Tests whether {@code code} is an ASCII decimal digit.
      *
      * @param code a code value
      * @return {@code true} for {@code '0'..'9'}
      */
     public static boolean isDigit(int code) {
         return code >= '0' && code <= '9';
     }

     /**
      * Tests whether the general category of {@code code} is selected by
      * {@code CharacterType.WORD_MASK}.
      *
      * @param code a code value
      * @return {@code true} if the category bit is set in the word mask
      */
     public static boolean isWord(int code) {
         // letter, digit, or '_'
         return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
     }

     /**
      * Tests whether {@code code} is the line feed character.
      *
      * @param code a code value
      * @return {@code true} if {@code code == NEW_LINE}
      */
     public static boolean isNewLine(int code) {
         return code == NEW_LINE;
     }

     /**
      * Tests whether a line feed is located at index {@code p}.
      *
      * @param chars the text
      * @param p     index to inspect
      * @param end   exclusive upper bound for {@code p}
      * @return {@code true} if {@code p < end} and {@code chars[p]} is a line feed
      */
     public static boolean isNewLine(char[] chars, int p, int end) {
         return p < end && chars[p] == NEW_LINE;
     }

     /**
      * Tests whether a carriage return / line feed pair starts at index {@code p}.
      *
      * @param chars the text
      * @param p     index of the candidate carriage return
      * @param end   exclusive upper bound; both characters must lie below it
      * @return {@code true} if {@code chars[p]} and {@code chars[p + 1]} are CR and LF
      */
     public static boolean isCrnl(char[] chars, int p, int end) {
         return p + 1 < end && chars[p] == RETURN && chars[p + 1] == NEW_LINE;
     }

     /**
      * Returns the position immediately before {@code s}. (Encoding.prevCharHead)
      *
      * @param p lower bound that must not be crossed
      * @param s current position
      * @return {@code s - 1}, or {@code -1} if {@code s <= p}
      */
     public static int prevCharHead(int p, int s) {
         return s <= p ? -1 : s - 1;
     }

     /**
      * Returns {@code s} unchanged and, when a holder is supplied, stores
      * {@code -1} in it. (onigenc_get_right_adjust_char_head_with_prev)
      *
      * @param s    current position
      * @param prev receives {@code -1} when not {@code null}
      * @return {@code s}
      */
     public static int rightAdjustCharHeadWithPrev(int s, IntHolder prev) {
         if (prev != null) {
             prev.value = -1; /* Sorry */
         }
         return s;
     }

     /**
      * Moves {@code s} back by up to {@code n} positions without going below
      * {@code p}. (Encoding.stepBack)
      *
      * @param p lower bound that must not be crossed
      * @param s current position; {@code -1} is returned unchanged
      * @param n number of positions to step back
      * @return the new position, or {@code -1} if a step would start at or below {@code p}
      */
     public static int stepBack(int p, int s, int n) {
         while (s != -1 && n-- > 0) {
             if (s <= p) {
                 return -1;
             }
             s--;
         }
         return s;
     }

     /**
      * Compares up to {@code n} characters of two arrays.
      * (onigenc_with_ascii_strncmp)
      *
      * <p>The result is computed as {@code chars2[..] - chars1[..]}, so it is
      * positive when the character from {@code chars2} is the greater one. No
      * bound is applied to {@code chars2}.</p>
      *
      * @param chars1 first text
      * @param p1     start index in {@code chars1}
      * @param end    exclusive end index in {@code chars1}
      * @param chars2 second text
      * @param p2     start index in {@code chars2}
      * @param n      maximum number of characters to compare
      * @return {@code 0} if all compared characters are equal; the difference at
      *         the first mismatch; or the current {@code chars2} character if
      *         {@code chars1} is exhausted first
      */
     public static int strNCmp(char[] chars1, int p1, int end, char[] chars2, int p2, int n) {
         while (n-- > 0) {
             if (p1 >= end) {
                 return chars2[p2];
             }
             int c = chars1[p1];
             int x = chars2[p2] - c;
             if (x != 0) {
                 return x;
             }

             p2++;
             p1++;
         }
         return 0;
     }

     /**
      * Combines the bytes in {@code [p, end)} into one integer, first byte most
      * significant. Each byte is treated as unsigned.
      *
      * @param bytes source bytes
      * @param p     inclusive start index
      * @param end   exclusive end index
      * @return the accumulated value, {@code 0} for an empty span
      */
     public static int mbcToCode(byte[] bytes, int p, int end) {
         int code = 0;
         for (int i = p; i < end; i++) {
             code = (code << 8) | (bytes[i] & 0xff);
         }
         return code;
     }

     /**
      * Returns the constant {@code 0x80}.
      *
      * @return {@code 0x80}
      */
     public static int mbcodeStartPosition() {
         return 0x80;
     }

     /**
      * Returns the opposite-case form of {@code c}.
      *
      * @param flag not used by this implementation
      * @param c    the character to fold
      * @return a one-element array holding the lower-case form of an upper-case
      *         character or the upper-case form of a lower-case character;
      *         {@link #EMPTYCHARS} otherwise
      */
     public static char[] caseFoldCodesByString(int flag, char c) {
         if (Character.isUpperCase(c)) {
             return new char[] {Character.toLowerCase(c)};
         } else if (Character.isLowerCase(c)) {
             return new char[] {Character.toUpperCase(c)};
         } else {
             return EMPTYCHARS;
         }
     }

     /**
      * Calls {@code fun} for every code value below {@code 0xffff} whose general
      * category is {@link Character#LOWERCASE_LETTER}: once mapping the letter to
      * its upper-case form and once mapping the upper-case form back.
      *
      * @param flag not used by this implementation
      * @param fun  callback receiving {@code (from, to[], 1, arg)}
      * @param arg  passed through to {@code fun} unchanged
      */
     public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) {
         // One-element array reused for every callback invocation.
         int[] code = new int[1];

         for (int c = 0; c < 0xffff; c++) {
             if (Character.getType(c) == Character.LOWERCASE_LETTER) {

                 int upper = code[0] = Character.toUpperCase(c);
                 fun.apply(c, code, 1, arg);

                 code[0] = c;
                 fun.apply(upper, code, 1, arg);
             }
         }
     }

     /**
      * Tests whether {@code code} lies in one of the ranges of a table that
      * starts at index {@code 0}. (CodeRange.isInCodeRange)
      *
      * @param p    table in the form {@code [count, start, end, start, end, ...]}
      * @param code value to look up
      * @return {@code true} if some range contains {@code code}
      */
     public static boolean isInCodeRange(int[]p, int code) {
         return isInCodeRange(p, 0, code);
     }

     /**
      * Returns the code-range table for a character type, building it on first
      * request and storing it in {@link #codeRanges}.
      *
      * <p>The table has the form {@code [count, start, end, start, end, ...]}
      * with inclusive bounds in ascending order, covering {@code 0..0xffff}.</p>
      *
      * @param ctype character type id accepted by {@link #isCodeCType(int, int)}
      * @param sbOut always receives {@code 0x100}
      * @return the table, or {@code null} if {@code ctype} is not below
      *         {@code codeRanges.length}
      */
     public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
         sbOut.value = 0x100; // use bitset for codes smaller than 256
         int[] range = null;

         if (ctype < codeRanges.length) {
             range = codeRanges[ctype];

             if (range == null) {
                 // format: [numberOfRanges, rangeStart, rangeEnd, ...]
                 range = new int[16];
                 int rangeCount = 0;
                 int lastCode = -2; // guarantees that code 0 opens a new range

                 for (int code = 0; code <= 0xffff; code++) {
                     if (isCodeCType(code, ctype)) {
                         if (lastCode < code - 1) {
                             // Gap since the previous member: open a new range,
                             // growing the table when start and end would not fit.
                             if (rangeCount * 2 + 2 >= range.length) {
                                 range = Arrays.copyOf(range, range.length * 2);
                             }
                             range[rangeCount * 2 + 1] = code;
                             rangeCount++;
                         }
                         // Extend the end of the current range to this code.
                         range[rangeCount * 2] = lastCode = code;
                     }
                 }

                 // Trim unused capacity.
                 if (rangeCount * 2 + 1 < range.length) {
                     range = Arrays.copyOf(range, rangeCount * 2 + 1);
                 }

                 range[0] = rangeCount;
                 codeRanges[ctype] = range;
             }
         }

         return range;
     }

     /**
      * Tests whether {@code code} lies in one of the ranges of a table that
      * starts at index {@code offset}. (CodeRange.isInCodeRange)
      *
      * <p>Uses binary search over the range end values, so the ranges are
      * expected to be in ascending order.</p>
      *
      * @param p      array containing {@code [count, start, end, ...]} at {@code offset}
      * @param offset index of the count element
      * @param code   value to look up
      * @return {@code true} if some range contains {@code code}
      */
     public static boolean isInCodeRange(int[]p, int offset, int code) {
         int low = 0;
         int n = p[offset];
         int high = n;

         // Find the first range whose end is not below code.
         while (low < high) {
             int x = (low + high) >>> 1;
             if (code > p[(x << 1) + 2 + offset]) {
                 low = x + 1;
             } else {
                 high = x;
             }
         }
         return low < n && code >= p[(low << 1) + 1 + offset];
     }

     /**
      * Tests whether {@code code} belongs to the character type {@code ctype}.
      *
      * @param code  a code value
      * @param ctype one of the {@code CharacterType} type constants
      * @return {@code true} if {@code code} is a member of the type
      * @throws RuntimeException if {@code ctype} is not a handled type
      * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
      */
     public static boolean isCodeCType(int code, int ctype) {
         int type;
         switch (ctype) {
             case CharacterType.NEWLINE:
                 return code == EncodingHelper.NEW_LINE;
             case CharacterType.ALPHA:
                 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
             case CharacterType.BLANK:
                 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
             case CharacterType.CNTRL:
                 type = Character.getType(code);
                 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
             case CharacterType.DIGIT:
                 return EncodingHelper.isDigit(code);
             case CharacterType.GRAPH:
                 switch (code) {
                     case 0x09:
                     case 0x0a:
                     case 0x0b:
                     case 0x0c:
                     case 0x0d:
                         return false;
                     default:
                         // The mask lists excluded categories, hence the == 0 test.
                         type = Character.getType(code);
                         return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
                 }
             case CharacterType.LOWER:
                 return Character.isLowerCase(code);
             case CharacterType.PRINT:
                 // The mask lists excluded categories, hence the == 0 test.
                 type = Character.getType(code);
                 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
             case CharacterType.PUNCT:
                 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
             case CharacterType.SPACE:
                 // ECMA 7.2 and 7.3
                 switch (code) {
                     case 0x09:
                     case 0x0a:
                     case 0x0b:
                     case 0x0c:
                     case 0x0d:
                         return true;
                     default:
                         // true if Unicode separator or BOM
                         return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff;
                 }
             case CharacterType.UPPER:
                 return Character.isUpperCase(code);
             case CharacterType.XDIGIT:
                 return EncodingHelper.isXDigit(code);
             case CharacterType.WORD:
                 return EncodingHelper.isWord(code);
             case CharacterType.ALNUM:
                 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
             case CharacterType.ASCII:
                 return code < 0x80;
             default:
                 throw new RuntimeException("illegal character type: " + ctype);
         }
     }
 }
	/*
	* Permission is hereby granted, free of charge, to any person obtaining a copy of
	* this software and associated documentation files (the "Software"), to deal in
	* the Software without restriction, including without limitation the rights to
	* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
	* of the Software, and to permit persons to whom the Software is furnished to do
	* so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in all
	* copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*/
	package jdk.nashorn.internal.runtime.regexp.joni;

	import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
	import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;

	import java.util.Arrays;

	public class EncodingHelper {

	public final static char NEW_LINE = 0xa;
	public final static char RETURN = 0xd;

	final static char[] EMPTYCHARS = new char[0];
	final static int[][] codeRanges = new int[15][];

	public static int digitVal(int code) {
	return code - '0';
	}

	public static int odigitVal(int code) {
	return digitVal(code);
	}

	public static boolean isXDigit(int code) {
	return Character.isDigit(code) \|\| (code >= 'a' && code <= 'f') \|\| (code >= 'A' && code <= 'F');
	}

	public static int xdigitVal(int code) {
	if (Character.isDigit(code)) {
	return code - '0';
	} else if (code >= 'a' && code <= 'f') {
	return code - 'a' + 10;
	} else {
	return code - 'A' + 10;
	}
	}

	public static boolean isDigit(int code) {
	return code >= '0' && code <= '9';
	}

	public static boolean isWord(int code) {
	// letter, digit, or '_'
	return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
	}

	public static boolean isNewLine(int code) {
	return code == NEW_LINE;
	}

	public static boolean isNewLine(char[] chars, int p, int end) {
	return p < end && chars[p] == NEW_LINE;
	}

	public static boolean isCrnl(char[] chars, int p, int end) {
	return p + 1 < end && chars[p] == RETURN && chars[p + 1] == NEW_LINE;
	}

	// Encoding.prevCharHead
	public static int prevCharHead(int p, int s) {
	return s <= p ? -1 : s - 1;
	}

	/* onigenc_get_right_adjust_char_head_with_prev */
	public static int rightAdjustCharHeadWithPrev(int s, IntHolder prev) {
	if (prev != null) prev.value = -1; /* Sorry */
	return s;
	}

	// Encoding.stepBack
	public static int stepBack(int p, int s, int n) {
	while (s != -1 && n-- > 0) {
	if (s <= p) return -1;
	s--;
	}
	return s;
	}

	/* onigenc_with_ascii_strncmp */
	public static int strNCmp(char[] chars1, int p1, int end, char[] chars2, int p2, int n) {
	while (n-- > 0) {
	if (p1 >= end) return chars2[p2];
	int c = chars1[p1];
	int x = chars2[p2] - c;
	if (x != 0) return x;

	p2++;
	p1++;
	}
	return 0;
	}

	public static int mbcToCode(byte[] bytes, int p, int end) {
	int code = 0;
	for (int i = p; i < end; i++) {
	code = (code << 8) \| (bytes[i] & 0xff);
	}
	return code;
	}

	public static int mbcodeStartPosition() {
	return 0x80;
	}

	public static char[] caseFoldCodesByString(int flag, char c) {
	if (Character.isUpperCase(c)) {
	return new char[] {Character.toLowerCase(c)};
	} else if (Character.isLowerCase(c)) {
	return new char[] {Character.toUpperCase(c)};
	} else {
	return EMPTYCHARS;
	}
	}

	public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) {
	int[] code = new int[1];

	for (int c = 0; c < 0xffff; c++) {
	if (Character.getType(c) == Character.LOWERCASE_LETTER) {

	int upper = code[0] = Character.toUpperCase(c);
	fun.apply(c, code, 1, arg);

	code[0] = c;
	fun.apply(upper, code, 1, arg);
	}
	}
	}

	// CodeRange.isInCodeRange
	public static boolean isInCodeRange(int[]p, int code) {
	int low = 0;
	int n = p[0];
	int high = n;

	while (low < high) {
	int x = (low + high) >> 1;
	if (code > p[(x << 1) + 2]) {
	low = x + 1;
	} else {
	high = x;
	}
	}
	return low < n && code >= p[(low << 1) + 1];
	}

	public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
	sbOut.value = 0x100; // use bitset for codes smaller than 256
	int[] range = null;

	if (ctype < codeRanges.length) {
	range = codeRanges[ctype];

	if (range == null) {
	// format: [numberOfRanges, rangeStart, rangeEnd, ...]
	range = new int[16];
	int rangeCount = 0;
	int lastCode = -2;

	for (int code = 0; code <= 0xffff; code++) {
	if (isCodeCType(code, ctype)) {
	if (lastCode < code -1) {
	if (rangeCount * 2 + 2 >= range.length) {
	range = Arrays.copyOf(range, range.length * 2);
	}
	range[rangeCount * 2 + 1] = code;
	rangeCount++;
	}
	range[rangeCount * 2] = lastCode = code;
	}
	}

	if (rangeCount * 2 + 1 < range.length) {
	range = Arrays.copyOf(range, rangeCount * 2 + 1);
	}

	range[0] = rangeCount;
	codeRanges[ctype] = range;
	}
	}

	return range;
	}

	// CodeRange.isInCodeRange
	public static boolean isInCodeRange(int[]p, int offset, int code) {
	int low = 0;
	int n = p[offset];
	int high = n ;

	while (low < high) {
	int x = (low + high) >> 1;
	if (code > p[(x << 1) + 2 + offset]) {
	low = x + 1;
	} else {
	high = x;
	}
	}
	return low < n && code >= p[(low << 1) + 1 + offset];
	}

	/**
	* @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
	*/
	public static boolean isCodeCType(int code, int ctype) {
	int type;
	switch (ctype) {
	case CharacterType.NEWLINE:
	return code == EncodingHelper.NEW_LINE;
	case CharacterType.ALPHA:
	return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
	case CharacterType.BLANK:
	return code == 0x09 \|\| Character.getType(code) == Character.SPACE_SEPARATOR;
	case CharacterType.CNTRL:
	type = Character.getType(code);
	return (1 << type & CharacterType.CNTRL_MASK) != 0 \|\| type == Character.UNASSIGNED;
	case CharacterType.DIGIT:
	return EncodingHelper.isDigit(code);
	case CharacterType.GRAPH:
	switch (code) {
	case 0x09:
	case 0x0a:
	case 0x0b:
	case 0x0c:
	case 0x0d:
	return false;
	default:
	type = Character.getType(code);
	return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
	}
	case CharacterType.LOWER:
	return Character.isLowerCase(code);
	case CharacterType.PRINT:
	type = Character.getType(code);
	return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
	case CharacterType.PUNCT:
	return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
	case CharacterType.SPACE:
	// ECMA 7.2 and 7.3
	switch (code) {
	case 0x09:
	case 0x0a:
	case 0x0b:
	case 0x0c:
	case 0x0d:
	return true;
	default:
	// true if Unicode separator or BOM
	return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 \|\| code == 0xfeff;
	}
	case CharacterType.UPPER:
	return Character.isUpperCase(code);
	case CharacterType.XDIGIT:
	return EncodingHelper.isXDigit(code);
	case CharacterType.WORD:
	return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
	case CharacterType.ALNUM:
	return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
	case CharacterType.ASCII:
	return code < 0x80;
	default:
	throw new RuntimeException("illegal character type: " + ctype);
	}
	}
	}