| /* |
| ******************************************************************************* |
| * Copyright (C) 2012-2014, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ******************************************************************************* |
| * collationdata.cpp |
| * |
| * created on: 2012jul28 |
| * created by: Markus W. Scherer |
| */ |
| |
| #include "unicode/utypes.h" |
| |
| #if !UCONFIG_NO_COLLATION |
| |
| #include "unicode/ucol.h" |
| #include "unicode/udata.h" |
| #include "unicode/uscript.h" |
| #include "cmemory.h" |
| #include "collation.h" |
| #include "collationdata.h" |
| #include "uassert.h" |
| #include "utrie2.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| uint32_t |
| CollationData::getIndirectCE32(uint32_t ce32) const { |
| U_ASSERT(Collation::isSpecialCE32(ce32)); |
| int32_t tag = Collation::tagFromCE32(ce32); |
| if(tag == Collation::DIGIT_TAG) { |
| // Fetch the non-numeric-collation CE32. |
| ce32 = ce32s[Collation::indexFromCE32(ce32)]; |
| } else if(tag == Collation::LEAD_SURROGATE_TAG) { |
| ce32 = Collation::UNASSIGNED_CE32; |
| } else if(tag == Collation::U0000_TAG) { |
| // Fetch the normal ce32 for U+0000. |
| ce32 = ce32s[0]; |
| } |
| return ce32; |
| } |
| |
| uint32_t |
| CollationData::getFinalCE32(uint32_t ce32) const { |
| if(Collation::isSpecialCE32(ce32)) { |
| ce32 = getIndirectCE32(ce32); |
| } |
| return ce32; |
| } |
| |
| uint32_t |
| CollationData::getFirstPrimaryForGroup(int32_t script) const { |
| int32_t index = findScript(script); |
| if(index < 0) { |
| return 0; |
| } |
| uint32_t head = scripts[index]; |
| return (head & 0xff00) << 16; |
| } |
| |
| uint32_t |
| CollationData::getLastPrimaryForGroup(int32_t script) const { |
| int32_t index = findScript(script); |
| if(index < 0) { |
| return 0; |
| } |
| uint32_t head = scripts[index]; |
| uint32_t lastByte = head & 0xff; |
| return ((lastByte + 1) << 24) - 1; |
| } |
| |
| int32_t |
| CollationData::getGroupForPrimary(uint32_t p) const { |
| p >>= 24; // Reordering groups are distinguished by primary lead bytes. |
| for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { |
| uint32_t lastByte = scripts[i] & 0xff; |
| if(p <= lastByte) { |
| return scripts[i + 2]; |
| } |
| } |
| return -1; |
| } |
| |
| int32_t |
| CollationData::findScript(int32_t script) const { |
| if(script < 0 || 0xffff < script) { return -1; } |
| for(int32_t i = 0; i < scriptsLength;) { |
| int32_t limit = i + 2 + scripts[i + 1]; |
| for(int32_t j = i + 2; j < limit; ++j) { |
| if(script == scripts[j]) { return i; } |
| } |
| i = limit; |
| } |
| return -1; |
| } |
| |
| int32_t |
| CollationData::getEquivalentScripts(int32_t script, |
| int32_t dest[], int32_t capacity, |
| UErrorCode &errorCode) const { |
| if(U_FAILURE(errorCode)) { return 0; } |
| int32_t i = findScript(script); |
| if(i < 0) { return 0; } |
| int32_t length = scripts[i + 1]; |
| U_ASSERT(length != 0); |
| if(length > capacity) { |
| errorCode = U_BUFFER_OVERFLOW_ERROR; |
| return length; |
| } |
| i += 2; |
| dest[0] = scripts[i++]; |
| for(int32_t j = 1; j < length; ++j) { |
| script = scripts[i++]; |
| // Sorted insertion. |
| for(int32_t k = j;; --k) { |
| // Invariant: dest[k] is free to receive either script or dest[k - 1]. |
| if(k > 0 && script < dest[k - 1]) { |
| dest[k] = dest[k - 1]; |
| } else { |
| dest[k] = script; |
| break; |
| } |
| } |
| } |
| return length; |
| } |
| |
| void |
| CollationData::makeReorderTable(const int32_t *reorder, int32_t length, |
| uint8_t table[256], UErrorCode &errorCode) const { |
| if(U_FAILURE(errorCode)) { return; } |
| |
| // Initialize the table. |
| // Never reorder special low and high primary lead bytes. |
| int32_t lowByte; |
| for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) { |
| table[lowByte] = lowByte; |
| } |
| // lowByte == 03 |
| |
| int32_t highByte; |
| for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) { |
| table[highByte] = highByte; |
| } |
| // highByte == FE |
| |
| // Set intermediate bytes to 0 to indicate that they have not been set yet. |
| for(int32_t i = lowByte; i <= highByte; ++i) { |
| table[i] = 0; |
| } |
| |
| // Get the set of special reorder codes in the input list. |
| // This supports up to 32 special reorder codes; |
| // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. |
| uint32_t specials = 0; |
| for(int32_t i = 0; i < length; ++i) { |
| int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; |
| if(0 <= reorderCode && reorderCode <= 31) { |
| specials |= (uint32_t)1 << reorderCode; |
| } |
| } |
| |
| // Start the reordering with the special low reorder codes that do not occur in the input. |
| for(int32_t i = 0;; i += 3) { |
| if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes. |
| int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; |
| if(reorderCode < 0) { break; } // Went beyond special reorder codes. |
| if((specials & ((uint32_t)1 << reorderCode)) == 0) { |
| int32_t head = scripts[i]; |
| int32_t firstByte = head >> 8; |
| int32_t lastByte = head & 0xff; |
| do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); |
| } |
| } |
| |
| // Reorder according to the input scripts, continuing from the bottom of the bytes range. |
| for(int32_t i = 0; i < length;) { |
| int32_t script = reorder[i++]; |
| if(script == USCRIPT_UNKNOWN) { |
| // Put the remaining scripts at the top. |
| while(i < length) { |
| script = reorder[--length]; |
| if(script == USCRIPT_UNKNOWN || // Must occur at most once. |
| script == UCOL_REORDER_CODE_DEFAULT) { |
| errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| int32_t index = findScript(script); |
| if(index < 0) { continue; } |
| int32_t head = scripts[index]; |
| int32_t firstByte = head >> 8; |
| int32_t lastByte = head & 0xff; |
| if(table[firstByte] != 0) { // Duplicate or equivalent script. |
| errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| do { table[lastByte--] = highByte--; } while(firstByte <= lastByte); |
| } |
| break; |
| } |
| if(script == UCOL_REORDER_CODE_DEFAULT) { |
| // The default code must be the only one in the list, and that is handled by the caller. |
| // Otherwise it must not be used. |
| errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| int32_t index = findScript(script); |
| if(index < 0) { continue; } |
| int32_t head = scripts[index]; |
| int32_t firstByte = head >> 8; |
| int32_t lastByte = head & 0xff; |
| if(table[firstByte] != 0) { // Duplicate or equivalent script. |
| errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
| return; |
| } |
| do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); |
| } |
| |
| // Put all remaining scripts into the middle. |
| // Avoid table[0] which must remain 0. |
| for(int32_t i = 1; i <= 0xff; ++i) { |
| if(table[i] == 0) { table[i] = lowByte++; } |
| } |
| U_ASSERT(lowByte == highByte + 1); |
| } |
| |
| U_NAMESPACE_END |
| |
| #endif // !UCONFIG_NO_COLLATION |