blob: 5d8978d66f1f1f82c471aea386c89450573a2801 [file] [log] [blame]
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdint.h>
#include <unicode/uchar.h>
#include <unicode/utf16.h>
#include <minikin/GraphemeBreak.h>
namespace android {
bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
size_t offset) {
// This implementation closely follows Unicode Standard Annex #29 on
// Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
// implementing a tailored version of extended grapheme clusters.
// The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
// Rule GB1, sot /; Rule GB2, / eot
if (offset <= start || offset >= start + count) {
return true;
}
if (U16_IS_TRAIL(buf[offset])) {
// Don't break a surrogate pair
return false;
}
uint32_t c1 = 0;
uint32_t c2 = 0;
size_t offset_back = offset;
U16_PREV(buf, start, offset_back, c1);
U16_NEXT(buf, offset, count, c2);
int32_t p1 = u_getIntPropertyValue(c1, UCHAR_GRAPHEME_CLUSTER_BREAK);
int32_t p2 = u_getIntPropertyValue(c2, UCHAR_GRAPHEME_CLUSTER_BREAK);
// Rule GB3, CR x LF
if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
return false;
}
// Rule GB4, (Control | CR | LF) /
if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
return true;
}
// Rule GB5, / (Control | CR | LF)
if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
// exclude zero-width control characters from breaking (tailoring of TR29)
if (c2 == 0x00ad
|| (c2 >= 0x200b && c2 <= 0x200f)
|| (c2 >= 0x2028 && c2 <= 0x202e)
|| (c2 >= 0x2060 && c2 <= 0x206f)) {
return false;
}
return true;
}
// Rule GB6, L x ( L | V | LV | LVT )
if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
return false;
}
// Rule GB7, ( LV | V ) x ( V | T )
if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
return false;
}
// Rule GB8, ( LVT | T ) x T
if ((p1 == U_GCB_L || p1 == U_GCB_T) && p2 == U_GCB_T) {
return false;
}
// Rule GB8a, Regional_Indicator x Regional_Indicator
if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
return false;
}
// Rule GB9, x Extend; Rule GB9a, x SpacingMark
if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK) {
if (c2 == 0xe33) {
// most other implementations break THAI CHARACTER SARA AM
// (tailoring of TR29)
return true;
}
return false;
}
// Cluster indic syllables togeter (tailoring of TR29)
if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama
&& u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
return false;
}
// Rule GB10, Any / Any
return true;
}
// Moves a cursor position to a grapheme cluster boundary within the text run
// covering indices [start, start + count) of buf, according to opt:
//   AFTER         step one code unit forward (if not already at the run end),
//                 then advance until a boundary is reached.
//   AT_OR_AFTER   advance until a boundary is reached; stays put if offset
//                 already is one.
//   BEFORE        step one code unit backward (if not already at the run
//                 start), then retreat until a boundary is reached.
//   AT_OR_BEFORE  retreat until a boundary is reached; stays put if offset
//                 already is one.
//   AT            keep offset if it is a boundary, otherwise yield (size_t)-1.
// Returns the resulting offset. Any other opt value returns offset unchanged.
size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count,
        size_t offset, MoveOpt opt) {
    if (opt == AT) {
        // Pure query: report the position only when it is already a boundary.
        if (isGraphemeBreak(buf, start, count, offset)) {
            return offset;
        }
        return static_cast<size_t>(-1);
    }

    if (opt == AFTER || opt == AT_OR_AFTER) {
        const size_t runEnd = start + count;
        if (opt == AFTER && offset < runEnd) {
            ++offset;
        }
        // isGraphemeBreak reports a break at the run end, which bounds this scan.
        for (; !isGraphemeBreak(buf, start, count, offset); ++offset) {
        }
        return offset;
    }

    if (opt == BEFORE || opt == AT_OR_BEFORE) {
        if (opt == BEFORE && offset > start) {
            --offset;
        }
        // isGraphemeBreak reports a break at the run start, which bounds this scan.
        for (; !isGraphemeBreak(buf, start, count, offset); --offset) {
        }
        return offset;
    }

    return offset;
}
} // namespace android