blob: 83849e0e91d99d9974ce5af2f30c3f6911e93785 [file] [log] [blame]
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.MissingResourceException;
import sun.text.normalizer.UCharacter.HangulSyllableType;
import sun.text.normalizer.UCharacter.NumericType;
/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This classes store binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, array of char is used
* to store the binary data.</p>
* <p>UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into more meaning form lies on
* <a href=UCharacter.html>UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/
final class UCharacterProperty
{
// public data members -----------------------------------------------
/*
* public singleton instance
*/
public static final UCharacterProperty INSTANCE;
/**
* Trie data
*/
public Trie2_16 m_trie_;
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
/**
* Character type mask
*/
public static final int TYPE_MASK = 0x1F;
// uprops.h enum UPropertySource --------------------------------------- ***
/** From uchar.c/uprops.icu main trie */
public static final int SRC_CHAR=1;
/** From uchar.c/uprops.icu properties vectors trie */
public static final int SRC_PROPSVEC=2;
/** From ubidi_props.c/ubidi.icu */
public static final int SRC_BIDI=5;
/** From normalizer2impl.cpp/nfc.nrm */
public static final int SRC_NFC=8;
/** From normalizer2impl.cpp/nfkc.nrm */
public static final int SRC_NFKC=9;
// public methods ----------------------------------------------------
/**
* Gets the main property value for code point ch.
* @param ch code point whose property value is to be retrieved
* @return property value of code point
*/
public final int getProperty(int ch)
{
return m_trie_.get(ch);
}
/**
* Gets the unicode additional properties.
* Java version of C u_getUnicodeProperties().
* @param codepoint codepoint whose additional properties is to be
* retrieved
* @param column The column index.
* @return unicode properties
*/
public int getAdditional(int codepoint, int column) {
assert column >= 0;
if (column >= m_additionalColumnsCount_) {
return 0;
}
return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
}
/**
* <p>Get the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.</p>
* <p>This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters.</p>
* <p>The data is from the UCD file DerivedAge.txt.</p>
* <p>This API does not check the validity of the codepoint.</p>
* @param codepoint The code point.
* @return the Unicode version number
*/
public VersionInfo getAge(int codepoint)
{
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
return VersionInfo.getInstance(
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
version & LAST_NIBBLE_MASK_, 0, 0);
}
// int-value and enumerated properties --------------------------------- ***
public int getType(int c) {
return getProperty(c)&TYPE_MASK;
}
/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
*/
private static final int /* UHangulSyllableType */ gcbToHst[]={
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
/*
* Omit GCB values beyond what we need for hst.
* The code below checks for the array length.
*/
};
private class IntProperty {
int column; // SRC_PROPSVEC column, or "source" if mask==0
int mask;
int shift;
IntProperty(int column, int mask, int shift) {
this.column=column;
this.mask=mask;
this.shift=shift;
}
IntProperty(int source) {
this.column=source;
this.mask=0;
}
int getValue(int c) {
// systematic, directly stored properties
return (getAdditional(c, column)&mask)>>>shift;
}
}
private class BiDiIntProperty extends IntProperty {
BiDiIntProperty() {
super(SRC_BIDI);
}
}
private class CombiningClassIntProperty extends IntProperty {
CombiningClassIntProperty(int source) {
super(source);
}
}
private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties
int which;
int max;
NormQuickCheckIntProperty(int source, int which, int max) {
super(source);
this.which=which;
this.max=max;
}
}
private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
int getValue(int c) {
return UBiDiProps.INSTANCE.getPairedBracketType(c);
}
};
public int getIntPropertyValue(int c, int which) {
if (which == BIDI_PAIRED_BRACKET_TYPE) {
return intProp.getValue(c);
}
return 0; // undefined
}
/**
* Forms a supplementary code point from the argument character<br>
* Note this is for internal use hence no checks for the validity of the
* surrogate characters are done
* @param lead lead surrogate character
* @param trail trailing surrogate character
* @return code point of the supplementary character
*/
public static int getRawSupplementary(char lead, char trail)
{
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
}
/**
* Gets the type mask
* @param type character type
* @return mask
*/
public static final int getMask(int type)
{
return 1 << type;
}
/**
* Returns the digit values of characters like 'A' - 'Z', normal,
* half-width and full-width. This method assumes that the other digit
* characters are checked by the calling method.
* @param ch character to test
* @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
* its corresponding digit will be returned.
*/
public static int getEuropeanDigit(int ch) {
if ((ch > 0x7a && ch < 0xff21)
|| ch < 0x41 || (ch > 0x5a && ch < 0x61)
|| ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
return -1;
}
if (ch <= 0x7a) {
// ch >= 0x41 or ch < 0x61
return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
}
// ch >= 0xff21
if (ch <= 0xff3a) {
return ch + 10 - 0xff21;
}
// ch >= 0xff41 && ch <= 0xff5a
return ch + 10 - 0xff41;
}
public int digit(int c) {
int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
if(value<=9) {
return value;
} else {
return -1;
}
}
// protected variables -----------------------------------------------
/**
* Extra property trie
*/
Trie2_16 m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
/**
* Number of additional columns
*/
int m_additionalColumnsCount_;
/**
* Maximum values for block, bits used as in vector word
* 0
*/
int m_maxBlockScriptValue_;
/**
* Maximum values for script, bits used as in vector word
* 0
*/
int m_maxJTGValue_;
/**
* Script_Extensions data
*/
public char[] m_scriptExtensions_;
// private variables -------------------------------------------------
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Offset to add to combined surrogate pair to avoid masking.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
(UTF16.SURROGATE_MIN_VALUE <<
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
// property data constants -------------------------------------------------
/**
* Numeric types and values in the main properties words.
*/
private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
private static final int getNumericTypeValue(int props) {
return props >> NUMERIC_TYPE_VALUE_SHIFT_;
}
/* constants for the storage form of numeric types and values */
/** No numeric value. */
private static final int NTV_NONE_ = 0;
/** Decimal digits: nv=0..9 */
private static final int NTV_DECIMAL_START_ = 1;
/** Other digits: nv=0..9 */
private static final int NTV_DIGIT_START_ = 11;
/** Small integers: nv=0..154 */
private static final int NTV_NUMERIC_START_ = 21;
private static final int ntvGetType(int ntv) {
return
(ntv==NTV_NONE_) ? NumericType.NONE :
(ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL :
(ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
NumericType.NUMERIC;
}
/*
* Properties in vector word 0
* Bits
* 31..24 DerivedAge version major/minor one nibble each
* 23..22 3..1: Bits 7..0 = Script_Extensions index
* 3: Script value from Script_Extensions
* 2: Script=Inherited
* 1: Script=Common
* 0: Script=bits 7..0
* 21..20 reserved
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 7.. 0 UScriptCode
*/
/**
* Script_Extensions: mask includes Script
*/
public static final int SCRIPT_X_MASK = 0x00c000ff;
//private static final int SCRIPT_X_SHIFT = 22;
/**
* Integer properties mask and shift values for East Asian cell width.
* Equivalent to icu4c UPROPS_EA_MASK
*/
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
/**
* Integer properties mask and shift values for East Asian cell width.
* Equivalent to icu4c UPROPS_EA_SHIFT
*/
private static final int EAST_ASIAN_SHIFT_ = 17;
/**
* Integer properties mask and shift values for blocks.
* Equivalent to icu4c UPROPS_BLOCK_MASK
*/
private static final int BLOCK_MASK_ = 0x0001ff00;
/**
* Integer properties mask and shift values for blocks.
* Equivalent to icu4c UPROPS_BLOCK_SHIFT
*/
private static final int BLOCK_SHIFT_ = 8;
/**
* Integer properties mask and shift values for scripts.
* Equivalent to icu4c UPROPS_SHIFT_MASK
*/
public static final int SCRIPT_MASK_ = 0x000000ff;
/**
* Additional properties used in internal trie data
*/
/*
* Properties in vector word 1
* Each bit encodes one binary property.
* The following constants represent the bit number, use 1<<UPROPS_XYZ.
* UPROPS_BINARY_1_TOP<=32!
*
* Keep this list of property enums in sync with
* propListNames[] in icu/source/tools/genprops/props2.c!
*
* ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
*/
private static final int WHITE_SPACE_PROPERTY_ = 0;
private static final int DASH_PROPERTY_ = 1;
private static final int HYPHEN_PROPERTY_ = 2;
private static final int QUOTATION_MARK_PROPERTY_ = 3;
private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
private static final int MATH_PROPERTY_ = 5;
private static final int HEX_DIGIT_PROPERTY_ = 6;
private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
private static final int ALPHABETIC_PROPERTY_ = 8;
private static final int IDEOGRAPHIC_PROPERTY_ = 9;
private static final int DIACRITIC_PROPERTY_ = 10;
private static final int EXTENDER_PROPERTY_ = 11;
private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
private static final int GRAPHEME_LINK_PROPERTY_ = 14;
private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
private static final int RADICAL_PROPERTY_ = 17;
private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
private static final int DEPRECATED_PROPERTY_ = 20;
private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
private static final int XID_START_PROPERTY_ = 22;
private static final int XID_CONTINUE_PROPERTY_ = 23;
private static final int ID_START_PROPERTY_ = 24;
private static final int ID_CONTINUE_PROPERTY_ = 25;
private static final int GRAPHEME_BASE_PROPERTY_ = 26;
private static final int S_TERM_PROPERTY_ = 27;
private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
private static final int PATTERN_WHITE_SPACE = 30;
/*
* Properties in vector word 2
* Bits
* 31..26 reserved
* 25..20 Line Break
* 19..15 Sentence Break
* 14..10 Word Break
* 9.. 5 Grapheme Cluster Break
* 4.. 0 Decomposition Type
*/
private static final int LB_MASK = 0x03f00000;
private static final int LB_SHIFT = 20;
private static final int SB_MASK = 0x000f8000;
private static final int SB_SHIFT = 15;
private static final int WB_MASK = 0x00007c00;
private static final int WB_SHIFT = 10;
private static final int GCB_MASK = 0x000003e0;
private static final int GCB_SHIFT = 5;
/**
* Integer properties mask for decomposition type.
* Equivalent to icu4c UPROPS_DT_MASK.
*/
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
/**
* First nibble shift
*/
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
/**
* Second nibble mask
*/
private static final int LAST_NIBBLE_MASK_ = 0xF;
/**
* Age value shift
*/
private static final int AGE_SHIFT_ = 24;
// private constructors --------------------------------------------------
/**
* Constructor
* @exception IOException thrown when data reading fails or data corrupted
*/
private UCharacterProperty() throws IOException
{
// jar access
ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
// Read or skip the 16 indexes.
int propertyOffset = bytes.getInt();
/* exceptionOffset = */ bytes.getInt();
/* caseOffset = */ bytes.getInt();
int additionalOffset = bytes.getInt();
int additionalVectorsOffset = bytes.getInt();
m_additionalColumnsCount_ = bytes.getInt();
int scriptExtensionsOffset = bytes.getInt();
int reservedOffset7 = bytes.getInt();
/* reservedOffset8 = */ bytes.getInt();
/* dataTopOffset = */ bytes.getInt();
m_maxBlockScriptValue_ = bytes.getInt();
m_maxJTGValue_ = bytes.getInt();
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
// read the main properties trie
m_trie_ = Trie2_16.createFromSerialized(bytes);
int expectedTrieLength = (propertyOffset - 16) * 4;
int trieLength = m_trie_.getSerializedLength();
if(trieLength > expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for main trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// skip unused intervening data structures
ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
if(m_additionalColumnsCount_ > 0) {
// reads the additional property block
m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
trieLength = m_additionalTrie_.getSerializedLength();
if(trieLength > expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// additional properties
int size = scriptExtensionsOffset - additionalVectorsOffset;
m_additionalVectors_ = new int[size];
for (int i = 0; i < size; i ++) {
m_additionalVectors_[i] = bytes.getInt();
}
}
// Script_Extensions
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
if(numChars > 0) {
m_scriptExtensions_ = new char[numChars];
for(int i = 0; i < numChars; ++i) {
m_scriptExtensions_[i] = bytes.getChar();
}
}
}
private static final class IsAcceptable implements ICUBinary.Authenticate {
// @Override when we switch to Java 6
public boolean isDataVersionAcceptable(byte version[]) {
return version[0] == 7;
}
}
private static final int DATA_FORMAT = 0x5550726F; // "UPro"
public void upropsvec_addPropertyStarts(UnicodeSet set) {
/* add the start code point of each same-value range of the properties vectors trie */
if(m_additionalColumnsCount_>0) {
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
Trie2.Range range;
while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
set.add(range.startCodePoint);
}
}
}
// This static initializer block must be placed after
// other static member initialization
static {
try {
INSTANCE = new UCharacterProperty();
}
catch (IOException e) {
throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,"");
}
}
// Moved from UProperty.java
/**
* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
* Used in UAX #9: Unicode Bidirectional Algorithm
* (http://www.unicode.org/reports/tr9/)
* Returns UCharacter.BidiPairedBracketType values.
* @stable ICU 52
*/
public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
}