/* | |
* Copyright (C) 2007 Apple Computer, Inc. | |
* | |
* Portions are Copyright (C) 1998 Netscape Communications Corporation. | |
* | |
* This library is free software; you can redistribute it and/or | |
* modify it under the terms of the GNU Lesser General Public | |
* License as published by the Free Software Foundation; either | |
* version 2.1 of the License, or (at your option) any later version. | |
* | |
* This library is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
* Lesser General Public License for more details. | |
* | |
* You should have received a copy of the GNU Lesser General Public | |
* License along with this library; if not, write to the Free Software | |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
* | |
* Alternatively, the contents of this file may be used under the terms | |
* of either the Mozilla Public License Version 1.1, found at | |
* http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public | |
* License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html | |
* (the "GPL"), in which case the provisions of the MPL or the GPL are | |
* applicable instead of those above. If you wish to allow use of your | |
* version of this file only under the terms of one of those two | |
* licenses (the MPL or the GPL) and not to allow others to use your | |
* version of this file under the LGPL, indicate your decision by | |
* deleting the provisions above and replace them with the notice and
* other provisions required by the MPL or the GPL, as the case may be. | |
* If you do not delete the provisions above, a recipient may use your | |
* version of this file under any of the LGPL, the MPL or the GPL. | |
*/ | |
#include "config.h" | |
#include "UnicodeRange.h" | |
namespace WebCore { | |
// Language group name for each script-specific Unicode range.
// The table is indexed directly by a Unicode range value, so the order of the
// entries must match the numbering of the range constants exactly,
// e.g. gUnicodeRangeToLangGroupTable[cRangeCyrillic] is "x-cyrillic".
// Both the strings and the array of pointers are const, so the table cannot
// be modified at run time.
static const char* const gUnicodeRangeToLangGroupTable[] =
{
    "x-cyrillic",
    "el",
    "tr",
    "he",
    "ar",
    "x-baltic",
    "th",
    "ko",
    "ja",
    "zh-CN",
    "zh-TW",
    "x-devanagari",
    "x-tamil",
    "x-armn",
    "x-beng",
    "x-cans",
    "x-ethi",
    "x-geor",
    "x-gujr",
    "x-guru",
    "x-khmr",
    "x-mlym"
};
/********************************************************************** | |
* Unicode subranges as defined in unicode 3.0 | |
* x-western, x-central-euro, tr, x-baltic -> latin | |
* 0000 - 036f | |
* 1e00 - 1eff | |
* 2000 - 206f (general punctuation) | |
* 20a0 - 20cf (currency symbols) | |
* 2100 - 214f (letterlike symbols) | |
* 2150 - 218f (Number Forms) | |
* el -> greek | |
* 0370 - 03ff | |
* 1f00 - 1fff | |
* x-cyrillic -> cyrillic | |
* 0400 - 04ff | |
* he -> hebrew | |
* 0590 - 05ff | |
* ar -> arabic | |
* 0600 - 06ff | |
* fb50 - fdff (arabic presentation forms) | |
* fe70 - feff (arabic presentation forms b) | |
* th -> thai
* 0e00 - 0e7f | |
* ko -> korean | |
* ac00 - d7af (hangul Syllables) | |
* 1100 - 11ff (jamo) | |
* 3130 - 318f (hangul compatibility jamo) | |
* ja | |
* 3040 - 309f (hiragana) | |
* 30a0 - 30ff (katakana) | |
* zh-CN | |
* zh-TW | |
* | |
* CJK | |
* 3100 - 312f (bopomofo) | |
* 31a0 - 31bf (bopomofo extended) | |
* 3000 - 303f (CJK Symbols and Punctuation) | |
* 2e80 - 2eff (CJK radicals supplement) | |
* 2f00 - 2fdf (Kangxi Radicals) | |
* 2ff0 - 2fff (Ideographic Description Characters) | |
* 3190 - 319f (kanbun) | |
* 3200 - 32ff (Enclosed CJK letters and Months) | |
* 3300 - 33ff (CJK compatibility) | |
* 3400 - 4dbf (CJK Unified Ideographs Extension A) | |
* 4e00 - 9faf (CJK Unified Ideographs) | |
* f900 - fa5f (CJK Compatibility Ideographs) | |
* fe30 - fe4f (CJK compatibility Forms) | |
* ff00 - ffef (halfwidth and fullwidth forms) | |
* | |
* Armenian | |
* 0530 - 058f | |
* Syriac
* 0700 - 074f | |
* Thaana | |
* 0780 - 07bf | |
* Devanagari | |
* 0900 - 097f | |
* Bengali | |
* 0980 - 09ff | |
* Gurmukhi | |
* 0a00 - 0a7f | |
* Gujarati | |
* 0a80 - 0aff | |
* Oriya | |
* 0b00 - 0b7f | |
* Tamil | |
* 0b80 - 0bff | |
* Telugu | |
* 0c00 - 0c7f | |
* Kannada | |
* 0c80 - 0cff | |
* Malayalam | |
* 0d00 - 0d7f | |
* Sinhala | |
* 0d80 - 0def | |
* Lao | |
* 0e80 - 0eff | |
* Tibetan | |
* 0f00 - 0fbf | |
* Myanmar | |
* 1000 - 109f | |
* Georgian | |
* 10a0 - 10ff | |
* Ethiopic | |
* 1200 - 137f | |
* Cherokee | |
* 13a0 - 13ff | |
* Canadian Aboriginal Syllabics | |
* 1400 - 167f | |
* Ogham | |
* 1680 - 169f | |
* Runic | |
* 16a0 - 16ff | |
* Khmer | |
* 1780 - 17ff | |
* Mongolian | |
* 1800 - 18af | |
* Misc - superscripts and subscripts | |
* 2070 - 209f | |
* Misc - Combining Diacritical Marks for Symbols | |
* 20d0 - 20ff | |
* Misc - Arrows | |
* 2190 - 21ff | |
* Misc - Mathematical Operators | |
* 2200 - 22ff | |
* Misc - Miscellaneous Technical | |
* 2300 - 23ff | |
* Misc - Control picture | |
* 2400 - 243f | |
* Misc - Optical character recognition | |
* 2440 - 2450 | |
* Misc - Enclose Alphanumerics | |
* 2460 - 24ff | |
* Misc - Box Drawing | |
* 2500 - 257f | |
* Misc - Block Elements | |
* 2580 - 259f | |
* Misc - Geometric Shapes | |
* 25a0 - 25ff | |
* Misc - Miscellaneous Symbols | |
* 2600 - 267f | |
* Misc - Dingbats | |
* 2700 - 27bf | |
* Misc - Braille Patterns | |
* 2800 - 28ff | |
* Yi Syllables | |
* a000 - a48f | |
* Yi radicals | |
* a490 - a4cf | |
* Alphabetic Presentation Forms | |
* fb00 - fb4f | |
* Misc - Combining half Marks | |
* fe20 - fe2f | |
* Misc - small form variants | |
* fe50 - fe6f | |
* Misc - Specials | |
* fff0 - ffff | |
*********************************************************************/ | |
static const unsigned cNumSubTables = 9; | |
static const unsigned cSubTableSize = 16; | |
static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] = | |
{ | |
{ // table for X--- | |
cRangeTableBase+1, //u0xxx | |
cRangeTableBase+2, //u1xxx | |
cRangeTableBase+3, //u2xxx | |
cRangeSetCJK, //u3xxx | |
cRangeSetCJK, //u4xxx | |
cRangeSetCJK, //u5xxx | |
cRangeSetCJK, //u6xxx | |
cRangeSetCJK, //u7xxx | |
cRangeSetCJK, //u8xxx | |
cRangeSetCJK, //u9xxx | |
cRangeTableBase+4, //uaxxx | |
cRangeKorean, //ubxxx | |
cRangeKorean, //ucxxx | |
cRangeTableBase+5, //udxxx | |
cRangePrivate, //uexxx | |
cRangeTableBase+6 //ufxxx | |
}, | |
{ //table for 0X-- | |
cRangeSetLatin, //u00xx | |
cRangeSetLatin, //u01xx | |
cRangeSetLatin, //u02xx | |
cRangeGreek, //u03xx XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks | |
cRangeCyrillic, //u04xx | |
cRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian | |
cRangeArabic, //u06xx | |
cRangeTertiaryTable, //u07xx | |
cRangeUnassigned, //u08xx | |
cRangeTertiaryTable, //u09xx | |
cRangeTertiaryTable, //u0axx | |
cRangeTertiaryTable, //u0bxx | |
cRangeTertiaryTable, //u0cxx | |
cRangeTertiaryTable, //u0dxx | |
cRangeTertiaryTable, //u0exx | |
cRangeTibetan, //u0fxx | |
}, | |
{ //table for 1x-- | |
cRangeTertiaryTable, //u10xx | |
cRangeKorean, //u11xx | |
cRangeEthiopic, //u12xx | |
cRangeTertiaryTable, //u13xx | |
cRangeCanadian, //u14xx | |
cRangeCanadian, //u15xx | |
cRangeTertiaryTable, //u16xx | |
cRangeKhmer, //u17xx | |
cRangeMongolian, //u18xx | |
cRangeUnassigned, //u19xx | |
cRangeUnassigned, //u1axx | |
cRangeUnassigned, //u1bxx | |
cRangeUnassigned, //u1cxx | |
cRangeUnassigned, //u1dxx | |
cRangeSetLatin, //u1exx | |
cRangeGreek, //u1fxx | |
}, | |
{ //table for 2x-- | |
cRangeSetLatin, //u20xx | |
cRangeSetLatin, //u21xx | |
cRangeMathOperators, //u22xx | |
cRangeMiscTechnical, //u23xx | |
cRangeControlOpticalEnclose, //u24xx | |
cRangeBoxBlockGeometrics, //u25xx | |
cRangeMiscSymbols, //u26xx | |
cRangeDingbats, //u27xx | |
cRangeBraillePattern, //u28xx | |
cRangeUnassigned, //u29xx | |
cRangeUnassigned, //u2axx | |
cRangeUnassigned, //u2bxx | |
cRangeUnassigned, //u2cxx | |
cRangeUnassigned, //u2dxx | |
cRangeSetCJK, //u2exx | |
cRangeSetCJK, //u2fxx | |
}, | |
{ //table for ax-- | |
cRangeYi, //ua0xx | |
cRangeYi, //ua1xx | |
cRangeYi, //ua2xx | |
cRangeYi, //ua3xx | |
cRangeYi, //ua4xx | |
cRangeUnassigned, //ua5xx | |
cRangeUnassigned, //ua6xx | |
cRangeUnassigned, //ua7xx | |
cRangeUnassigned, //ua8xx | |
cRangeUnassigned, //ua9xx | |
cRangeUnassigned, //uaaxx | |
cRangeUnassigned, //uabxx | |
cRangeKorean, //uacxx | |
cRangeKorean, //uadxx | |
cRangeKorean, //uaexx | |
cRangeKorean, //uafxx | |
}, | |
{ //table for dx-- | |
cRangeKorean, //ud0xx | |
cRangeKorean, //ud1xx | |
cRangeKorean, //ud2xx | |
cRangeKorean, //ud3xx | |
cRangeKorean, //ud4xx | |
cRangeKorean, //ud5xx | |
cRangeKorean, //ud6xx | |
cRangeKorean, //ud7xx | |
cRangeSurrogate, //ud8xx | |
cRangeSurrogate, //ud9xx | |
cRangeSurrogate, //udaxx | |
cRangeSurrogate, //udbxx | |
cRangeSurrogate, //udcxx | |
cRangeSurrogate, //uddxx | |
cRangeSurrogate, //udexx | |
cRangeSurrogate, //udfxx | |
}, | |
{ // table for fx-- | |
cRangePrivate, //uf0xx | |
cRangePrivate, //uf1xx | |
cRangePrivate, //uf2xx | |
cRangePrivate, //uf3xx | |
cRangePrivate, //uf4xx | |
cRangePrivate, //uf5xx | |
cRangePrivate, //uf6xx | |
cRangePrivate, //uf7xx | |
cRangePrivate, //uf8xx | |
cRangeSetCJK, //uf9xx | |
cRangeSetCJK, //ufaxx | |
cRangeArabic, //ufbxx, includes alphabic presentation form | |
cRangeArabic, //ufcxx | |
cRangeArabic, //ufdxx | |
cRangeArabic, //ufexx, includes Combining half marks, | |
// CJK compatibility forms, | |
// CJK compatibility forms, | |
// small form variants | |
cRangeTableBase+8, //uffxx, halfwidth and fullwidth forms, includes Specials | |
}, | |
{ //table for 0x0500 - 0x05ff | |
cRangeCyrillic, //u050x | |
cRangeCyrillic, //u051x | |
cRangeCyrillic, //u052x | |
cRangeArmenian, //u053x | |
cRangeArmenian, //u054x | |
cRangeArmenian, //u055x | |
cRangeArmenian, //u056x | |
cRangeArmenian, //u057x | |
cRangeArmenian, //u058x | |
cRangeHebrew, //u059x | |
cRangeHebrew, //u05ax | |
cRangeHebrew, //u05bx | |
cRangeHebrew, //u05cx | |
cRangeHebrew, //u05dx | |
cRangeHebrew, //u05ex | |
cRangeHebrew, //u05fx | |
}, | |
{ //table for 0xff00 - 0xffff | |
cRangeSetCJK, //uff0x, fullwidth latin | |
cRangeSetCJK, //uff1x, fullwidth latin | |
cRangeSetCJK, //uff2x, fullwidth latin | |
cRangeSetCJK, //uff3x, fullwidth latin | |
cRangeSetCJK, //uff4x, fullwidth latin | |
cRangeSetCJK, //uff5x, fullwidth latin | |
cRangeSetCJK, //uff6x, halfwidth katakana | |
cRangeSetCJK, //uff7x, halfwidth katakana | |
cRangeSetCJK, //uff8x, halfwidth katakana | |
cRangeSetCJK, //uff9x, halfwidth katakana | |
cRangeSetCJK, //uffax, halfwidth hangul jamo | |
cRangeSetCJK, //uffbx, halfwidth hangul jamo | |
cRangeSetCJK, //uffcx, halfwidth hangul jamo | |
cRangeSetCJK, //uffdx, halfwidth hangul jamo | |
cRangeSetCJK, //uffex, fullwidth symbols | |
cRangeSpecials, //ufffx, Specials | |
}, | |
}; | |
// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80) | |
// code points so that the number of entries in the tertiary range | |
// table for that range is obtained by dividing (0x1700 - 0x0700) by 128. | |
// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal | |
// syllabaries take multiple chunks and Ogham and Runic share a single chunk. | |
static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80); | |
static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] = | |
{ //table for 0x0700 - 0x1600 | |
cRangeSyriac, //u070x | |
cRangeThaana, //u078x | |
cRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.) | |
cRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.) | |
cRangeDevanagari, //u090x | |
cRangeBengali, //u098x | |
cRangeGurmukhi, //u0a0x | |
cRangeGujarati, //u0a8x | |
cRangeOriya, //u0b0x | |
cRangeTamil, //u0b8x | |
cRangeTelugu, //u0c0x | |
cRangeKannada, //u0c8x | |
cRangeMalayalam, //u0d0x | |
cRangeSinhala, //u0d8x | |
cRangeThai, //u0e0x | |
cRangeLao, //u0e8x | |
cRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.) | |
cRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.) | |
cRangeMyanmar, //u100x | |
cRangeGeorgian, //u108x | |
cRangeKorean, //u110x place holder(resolved in the 2ndary tab.) | |
cRangeKorean, //u118x place holder(resolved in the 2ndary tab.) | |
cRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.) | |
cRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.) | |
cRangeEthiopic, //u130x | |
cRangeCherokee, //u138x | |
cRangeCanadian, //u140x place holder(resolved in the 2ndary tab.) | |
cRangeCanadian, //u148x place holder(resolved in the 2ndary tab.) | |
cRangeCanadian, //u150x place holder(resolved in the 2ndary tab.) | |
cRangeCanadian, //u158x place holder(resolved in the 2ndary tab.) | |
cRangeCanadian, //u160x | |
cRangeOghamRunic, //u168x this contains two scripts, Ogham & Runic | |
}; | |
// Returns the Unicode range value for a code point, using up to three table
// lookups keyed on successive parts of the code point.
//
// A two level index is almost enough for locating a range, with the
// exception of u03xx and u05xx. Since we don't really care about the range for
// combining diacritical marks in our font application, u03xx is not
// discriminated further (all of it is reported as Greek). Future adoption of
// this method for other uses should be aware of this limitation. The
// implementation can be extended if there is such a need.
// For Indic, Southeast Asian scripts and some other scripts between
// U+0700 and U+16FF, the lookup is extended to a third level.
//
// Code points outside U+0000 - U+FFFF are not covered by the tables and yield
// 0. Note that 0 is also a valid range value, so callers cannot distinguish
// "not covered" from that range by the return value alone.
unsigned int findCharUnicodeRange(UChar32 ch)
{
    // Reject negative values as well as non-BMP code points: both would
    // otherwise index outside the first-level table. U+FFFF itself is inside
    // the tables (the U+FFFx row) and is looked up normally.
    if (ch < 0 || ch > 0xFFFF)
        return 0;

    // First level: most significant hex digit.
    unsigned int range = gUnicodeSubrangeTable[0][ch >> 12];
    if (range < cRangeTableBase)
        return range; // already a specific range

    // Second level: the entry named a sub-table, indexed by the second digit.
    range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8];
    if (range < cRangeTableBase)
        return range;

    // Third level via another sub-table, indexed by the third digit.
    if (range < cRangeTertiaryTable)
        return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4];

    // Otherwise the third level is the U+0700 - U+16FF table of 128 code
    // point blocks.
    return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
}
const char* langGroupFromUnicodeRange(unsigned char unicodeRange) | |
{ | |
if (cRangeSpecificItemNum > unicodeRange) | |
return gUnicodeRangeToLangGroupTable[unicodeRange]; | |
return 0; | |
} | |
} |