blob: 616ba812bd9feda48977f15cefab7d097fd18d69 [file] [log] [blame]
* Copyright (C) 2006 Lars Knoll <>
* Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
#include "config.h"
#include "TextBreakIterator.h"
#include "LineBreakIteratorPoolICU.h"
#include <wtf/Atomics.h>
#include <wtf/text/WTFString.h>
using namespace WTF;
using namespace std;
namespace WebCore {
// Points the cached ICU break iterator in |iterator| at |string| and returns it.
//
// createdIterator - in/out flag; once set, ubrk_open() is not attempted again for this slot.
// iterator        - in/out slot holding the iterator opened on the first call.
// type            - break type passed to ubrk_open().
// string, length  - text passed to ubrk_setText().
//
// Returns 0 when |string| is null, when no iterator is available, or when
// ubrk_setText() reports a failure.
static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
    UBreakIteratorType type, const UChar* string, int length)
{
    if (!string)
        return 0;

    if (!createdIterator) {
        UErrorCode openStatus = U_ZERO_ERROR;
        UBreakIterator* opened = ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus);
        iterator = reinterpret_cast<TextBreakIterator*>(opened);
        // The flag is raised even if opening failed, so a failed open is not retried.
        createdIterator = true;
        ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    }

    if (!iterator)
        return 0;

    UErrorCode textStatus = U_ZERO_ERROR;
    ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &textStatus);
    return U_FAILURE(textStatus) ? 0 : iterator;
}
// Number of characters held in a Latin-1 UText chunk at a time.
static const int s_UTextCharacterBufferSize = 16;

// A UText bundled with the UChar storage that serves as its chunk buffer.
// The buffer has one element more than the chunk capacity.
struct UTextWithBuffer {
    UText uTextStruct;
    UChar uCharBuffer[s_UTextCharacterBufferSize + 1];
};
static UText emptyUText = UTEXT_INITIALIZER;
static UText* uTextLatin1Clone(UText*, const UText*, UBool, UErrorCode*);
static int64_t uTextLatin1NativeLength(UText*);
static UBool uTextLatin1Access(UText*, int64_t, UBool);
static int32_t uTextLatin1Extract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode*);
static int64_t uTextLatin1MapOffsetToNative(const UText*);
static int32_t uTextLatin1MapNativeIndexToUTF16(const UText*, int64_t);
static void uTextLatin1Close(UText*);
static struct UTextFuncs uTextLatin1Funcs = {
0, 0, 0,
0, 0, 0
// UText clone callback for the Latin-1 provider. Only shallow clones are
// expected (asserted): the clone shares the source's character pointer and
// length, but starts with an empty, zeroed chunk buffer of its own.
//
// Returns 0 if |status| already holds a failure on entry, |destination| if
// utext_setup() fails, and the set-up clone otherwise.
static UText* uTextLatin1Clone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
{
    ASSERT_UNUSED(deep, !deep);

    if (U_FAILURE(*status))
        return 0;

    const size_t bufferBytes = sizeof(UChar) * (s_UTextCharacterBufferSize + 1);

    UText* clone = utext_setup(destination, bufferBytes, status);
    if (U_FAILURE(*status))
        return destination;

    UChar* chunkBuffer = static_cast<UChar*>(clone->pExtra);

    clone->providerProperties = source->providerProperties;
    clone->context = source->context;
    clone->a = source->a;
    clone->pFuncs = &uTextLatin1Funcs;

    // Same position as the source, but with nothing buffered yet.
    clone->chunkNativeStart = source->chunkNativeStart;
    clone->chunkNativeLimit = source->chunkNativeStart;
    clone->nativeIndexingLimit = static_cast<int32_t>(source->chunkNativeStart);
    clone->chunkOffset = 0;

    clone->chunkContents = chunkBuffer;
    memset(chunkBuffer, 0, bufferBytes);

    return clone;
}
// UText nativeLength callback: the length is kept in the UText's |a| field.
static int64_t uTextLatin1NativeLength(UText* uText)
{
    const int64_t storedLength = uText->a;
    return storedLength;
}
// UText access callback for the Latin-1 provider.
//
// Makes the chunk buffer cover the text at |index| (when iterating forward)
// or just before |index| (when iterating backward), widening at most
// s_UTextCharacterBufferSize Latin-1 characters into the buffer.
//
// uText   - the UText; |context| holds the LChar data and |a| its length.
// index   - requested native index; out-of-range values are pinned to [0, length].
// forward - TRUE if the caller will read forward from |index|, FALSE if backward.
//
// Returns TRUE if there is a character to read in the requested direction,
// FALSE if |index| is at the corresponding end of the text.
static UBool uTextLatin1Access(UText* uText, int64_t index, UBool forward)
{
    int64_t length = uText->a;

    // Pin the request to the text. Without this, an index past the end would
    // produce a negative chunk length (and a huge unsigned copy below), and a
    // negative index would read before the start of the character data.
    if (index < 0)
        index = 0;
    else if (index > length)
        index = length;

    if (forward) {
        if (index < uText->chunkNativeLimit && index >= uText->chunkNativeStart) {
            /* Already inside the buffer. Set the new offset. */
            uText->chunkOffset = static_cast<int32_t>(index - uText->chunkNativeStart);
            return TRUE;
        }
        if (index >= length && uText->chunkNativeLimit == length) {
            /* Off the end of the buffer, but we can't get it. */
            uText->chunkOffset = uText->chunkLength;
            return FALSE;
        }
    } else {
        if (index <= uText->chunkNativeLimit && index > uText->chunkNativeStart) {
            /* Already inside the buffer. Set the new offset. */
            uText->chunkOffset = static_cast<int32_t>(index - uText->chunkNativeStart);
            return TRUE;
        }
        if (!index && !uText->chunkNativeStart) {
            /* Already at the beginning; can't go any farther */
            uText->chunkOffset = 0;
            return FALSE;
        }
    }

    // Refill the buffer around the requested position.
    if (forward) {
        uText->chunkNativeStart = index;
        uText->chunkNativeLimit = uText->chunkNativeStart + s_UTextCharacterBufferSize;
        if (uText->chunkNativeLimit > length)
            uText->chunkNativeLimit = length;
    } else {
        uText->chunkNativeLimit = index;
        uText->chunkNativeStart = uText->chunkNativeLimit - s_UTextCharacterBufferSize;
        if (uText->chunkNativeStart < 0)
            uText->chunkNativeStart = 0;
    }

    uText->chunkLength = static_cast<int32_t>(uText->chunkNativeLimit - uText->chunkNativeStart);

    // The offset must be derived from the *new* chunk length: a backward
    // refill leaves the position at the end of the freshly loaded chunk.
    uText->chunkOffset = forward ? 0 : uText->chunkLength;

    StringImpl::copyChars(const_cast<UChar*>(uText->chunkContents), static_cast<const LChar*>(uText->context) + uText->chunkNativeStart, static_cast<unsigned>(uText->chunkLength));

    uText->nativeIndexingLimit = uText->chunkLength;

    // An empty chunk means the (pinned) index sits at the end of the text in
    // the direction of iteration, so there is no character to hand out.
    return uText->chunkLength > 0 ? TRUE : FALSE;
}
// UText extract callback for the Latin-1 provider.
//
// Copies the characters in the native range [start, limit) into |dest| as
// UChars, clamping the range to the text length.
//
// dest / destCapacity - output buffer and its size in UChars; |dest| may be
//                       null only when |destCapacity| is 0 (preflighting).
// status              - set to U_ILLEGAL_ARGUMENT_ERROR or
//                       U_INDEX_OUTOFBOUNDS_ERROR for bad arguments,
//                       U_STRING_NOT_TERMINATED_WARNING if the text exactly
//                       fills |dest|, and U_BUFFER_OVERFLOW_ERROR if it does
//                       not fit.
//
// Returns the number of UChars in the (clamped) range, which may exceed
// |destCapacity|; 0 on error or for an empty range.
static int32_t uTextLatin1Extract(UText* uText, int64_t start, int64_t limit, UChar* dest, int32_t destCapacity, UErrorCode* status)
{
    int64_t length = uText->a;
    if (U_FAILURE(*status))
        return 0;

    if (destCapacity < 0 || (!dest && destCapacity > 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if (start < 0 || start > limit || (limit - start) > INT32_MAX) {
        *status = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    if (start > length)
        start = length;
    if (limit > length)
        limit = length;

    length = limit - start;

    if (!length)
        return 0;

    // Copy as much as fits. The guard has to test for a non-null |dest|: the
    // argument check above already rejected a null |dest| with a positive
    // capacity, so testing for null here would never copy anything.
    if (dest) {
        int32_t trimmedLength = static_cast<int32_t>(length);
        if (trimmedLength > destCapacity)
            trimmedLength = destCapacity;

        StringImpl::copyChars(dest, static_cast<const LChar*>(uText->context) + start, static_cast<unsigned>(trimmedLength));
    }

    if (length < destCapacity) {
        // Room to spare: NUL-terminate and clear a stale "not terminated" warning.
        dest[length] = 0;
        if (*status == U_STRING_NOT_TERMINATED_WARNING)
            *status = U_ZERO_ERROR;
    } else if (length == destCapacity)
        *status = U_STRING_NOT_TERMINATED_WARNING;
    else
        *status = U_BUFFER_OVERFLOW_ERROR;

    return static_cast<int32_t>(length);
}
static int64_t uTextLatin1MapOffsetToNative(const UText* uText)
return uText->chunkNativeStart + uText->chunkOffset;
static int32_t uTextLatin1MapNativeIndexToUTF16(const UText* uText, int64_t nativeIndex)
ASSERT_UNUSED(uText, uText->chunkNativeStart >= nativeIndex);
ASSERT_UNUSED(uText, nativeIndex < uText->chunkNativeLimit);
return static_cast<int32_t>(nativeIndex);
// UText close callback for the Latin-1 provider.
static void uTextLatin1Close(UText* uText)
{
    // Nothing is freed here; the pointer to the source characters is only cleared.
    uText->context = 0;
}
// Sets up |uTextLatin1| as a UText over |length| Latin-1 characters at |string|,
// served by the uTextLatin1Funcs callbacks and starting with a zeroed chunk buffer.
//
// Returns the set-up UText, or 0 if utext_setup() leaves a failure in |errorCode|.
static UText* UTextOpenLatin1(UTextWithBuffer* uTextLatin1, const LChar* string, unsigned length, UErrorCode* errorCode)
{
    const size_t bufferBytes = sizeof(UChar) * (s_UTextCharacterBufferSize + 1);

    UText* text = utext_setup(reinterpret_cast<UText*>(uTextLatin1), bufferBytes, errorCode);
    if (U_FAILURE(*errorCode))
        return 0;

    UChar* chunkBuffer = static_cast<UChar*>(text->pExtra);

    text->context = string;
    text->a = static_cast<int64_t>(length);
    text->pFuncs = &uTextLatin1Funcs;
    text->chunkContents = chunkBuffer;
    memset(chunkBuffer, 0, bufferBytes);

    return text;
}
// Returns the shared UBRK_WORD iterator positioned over |string|, or 0 if
// setUpIterator() cannot provide one. The iterator is kept in a function-local
// static, so every call hands back the same object.
TextBreakIterator* wordBreakIterator(const UChar* string, int length)
{
    static bool openAttempted = false;
    static TextBreakIterator* sharedWordIterator;

    return setUpIterator(openAttempted, sharedWordIterator, UBRK_WORD, string, length);
}
// Takes a line break iterator for |locale| from the shared pool and points it
// at the Latin-1 text |string| of |length| characters.
//
// Returns the iterator, or 0 if the pool has none or the text could not be
// attached. On failure the iterator goes back to the pool, so a failed
// acquire does not leak it.
TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale)
{
    UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
    if (!iterator)
        return 0;

    // Stack-allocated UText whose extra storage is the struct's inline buffer.
    UTextWithBuffer uTextLatin1Local;
    uTextLatin1Local.uTextStruct = emptyUText;
    uTextLatin1Local.uTextStruct.extraSize = sizeof(uTextLatin1Local.uCharBuffer);
    uTextLatin1Local.uTextStruct.pExtra = uTextLatin1Local.uCharBuffer;

    UErrorCode uTextOpenStatus = U_ZERO_ERROR;
    UText* uTextLatin1 = UTextOpenLatin1(&uTextLatin1Local, string, length, &uTextOpenStatus);
    if (U_FAILURE(uTextOpenStatus)) {
        LOG_ERROR("UTextOpenLatin1 failed with status %d", uTextOpenStatus);
        LineBreakIteratorPool::sharedPool().put(iterator);
        return 0;
    }

    UErrorCode setTextStatus = U_ZERO_ERROR;
    ubrk_setUText(iterator, uTextLatin1, &setTextStatus);

    // ubrk_setUText() works on its own shallow clone, so the local UText can
    // be closed right away, before it goes out of scope.
    utext_close(uTextLatin1);

    if (U_FAILURE(setTextStatus)) {
        LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
        LineBreakIteratorPool::sharedPool().put(iterator);
        return 0;
    }

    return reinterpret_cast<TextBreakIterator*>(iterator);
}
// Takes a line break iterator for |locale| from the shared pool and points it
// at the UTF-16 text |string| of |length| code units.
//
// Returns the iterator, or 0 if the pool has none or ubrk_setText() fails.
// On failure the iterator goes back to the pool, so a failed acquire does not
// leak it.
TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale)
{
    UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
    if (!iterator)
        return 0;

    UErrorCode setTextStatus = U_ZERO_ERROR;
    ubrk_setText(iterator, string, length, &setTextStatus);
    if (U_FAILURE(setTextStatus)) {
        LOG_ERROR("ubrk_setText failed with status %d", setTextStatus);
        LineBreakIteratorPool::sharedPool().put(iterator);
        return 0;
    }

    return reinterpret_cast<TextBreakIterator*>(iterator);
}
// Hands an iterator obtained from acquireLineBreakIterator() back to the
// shared pool so it can be reused. |iterator| must not be null.
void releaseLineBreakIterator(TextBreakIterator* iterator)
{
    ASSERT_ARG(iterator, iterator);

    LineBreakIteratorPool::sharedPool().put(reinterpret_cast<UBreakIterator*>(iterator));
}
static TextBreakIterator* nonSharedCharacterBreakIterator;
static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
return weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), expected, newValue);
DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
if (nonSharedCharacterBreakIterator != expected)
return false;
nonSharedCharacterBreakIterator = newValue;
return true;
// Claims the cached character break iterator if one is available (swapping
// the slot to 0), otherwise opens a new one, and points it at |buffer|.
NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, int length)
{
    m_iterator = nonSharedCharacterBreakIterator;
    // True only if this object won the cached iterator; setUpIterator() then
    // reuses it instead of opening a new one.
    bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
    m_iterator = setUpIterator(createdIterator, m_iterator, UBRK_CHARACTER, buffer, length);
}

// Returns the iterator to the cache slot if the slot is empty; otherwise the
// iterator is closed. Publishing the iterator must only happen here, once this
// object no longer uses it.
NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
{
    if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
        ubrk_close(reinterpret_cast<UBreakIterator*>(m_iterator));
}
// Returns the shared UBRK_SENTENCE iterator positioned over |string|, or 0 if
// setUpIterator() cannot provide one. The iterator is kept in a function-local
// static, so every call hands back the same object.
TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
{
    static bool openAttempted = false;
    static TextBreakIterator* sharedSentenceIterator;

    return setUpIterator(openAttempted, sharedSentenceIterator, UBRK_SENTENCE, string, length);
}
// Forwards to ICU's ubrk_first() and returns its result.
int textBreakFirst(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_first(icuIterator);
}
// Forwards to ICU's ubrk_last() and returns its result.
int textBreakLast(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_last(icuIterator);
}
// Forwards to ICU's ubrk_next() and returns its result.
int textBreakNext(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_next(icuIterator);
}
// Forwards to ICU's ubrk_previous() and returns its result.
int textBreakPrevious(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_previous(icuIterator);
}
// Forwards to ICU's ubrk_preceding() with |pos| and returns its result.
int textBreakPreceding(TextBreakIterator* iterator, int pos)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_preceding(icuIterator, pos);
}
// Forwards to ICU's ubrk_following() with |pos| and returns its result.
int textBreakFollowing(TextBreakIterator* iterator, int pos)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_following(icuIterator, pos);
}
// Forwards to ICU's ubrk_current() and returns its result.
int textBreakCurrent(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_current(icuIterator);
}
// Forwards to ICU's ubrk_isBoundary() for |position| and returns its result as a bool.
bool isTextBreak(TextBreakIterator* iterator, int position)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_isBoundary(icuIterator, position);
}
// Returns true unless the rule status ICU reports for |iterator| is UBRK_WORD_NONE.
bool isWordTextBreak(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_getRuleStatus(icuIterator) != UBRK_WORD_NONE;
}
// Same contract as setUpIterator(), except that the iterator is built from the
// custom rule source |breakRules| with ubrk_openRules() instead of being
// opened by break type.
//
// Returns 0 when |string| is null, when no iterator is available, or when
// ubrk_setText() reports a failure.
static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
    const char* breakRules, const UChar* string, int length)
{
    if (!string)
        return 0;

    if (!createdIterator) {
        UParseError parseStatus;
        UErrorCode openStatus = U_ZERO_ERROR;
        String rules(breakRules);
        UBreakIterator* opened = ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus);
        iterator = reinterpret_cast<TextBreakIterator*>(opened);
        // The flag is raised even if opening failed, so a failed open is not retried.
        createdIterator = true;
        ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    }

    if (!iterator)
        return 0;

    UErrorCode textStatus = U_ZERO_ERROR;
    ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &textStatus);
    return U_FAILURE(textStatus) ? 0 : iterator;
}
// Returns the shared break iterator used for cursor movement, positioned over
// |string|, or 0 if setUpIteratorWithRules() cannot provide one. The iterator
// is built once from the custom rule set below and kept in a function-local
// static.
TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
{
    // This rule set is based on character-break iterator rules of ICU 4.0.
    // The major differences from the original ones are listed below:
    // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
    // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
    // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
    // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks.
    // * Added rules for regional indicator symbols.
    //
    // The rules are split into a forward and a reverse section. Without the
    // "!!reverse;" directive the backward rules would be compiled as forward
    // rules, and without "!!chain;" a match could not continue from one rule
    // into the next (for example a base character followed by several
    // $Extend characters).
    static const char* kRules =
        "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
        "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
        "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
        "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
        "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
        "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
        "$L = [\\p{Grapheme_Cluster_Break = L}];"
        "$V = [\\p{Grapheme_Cluster_Break = V}];"
        "$T = [\\p{Grapheme_Cluster_Break = T}];"
        "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
        "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
        "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
        "$HinV = \\u094D;" // Devanagari Sign Virama
        "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
        "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
        "$BenV = \\u09CD;" // Bengali Sign Virama
        "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
        "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
        "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
        "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
        "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
        "$GujV = \\u0ACD;" // Gujarati Sign Virama
        "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
        "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
        "$OriV = \\u0B4D;" // Oriya Sign Virama
        "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
        "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
        "$TelV = \\u0C4D;" // Telugu Sign Virama
        // Starts at U+0C15 (Ka) like the other scripts; U+0C14 is the vowel Au.
        "$Tel1 = [\\u0C15-\\u0C39];" // Telugu Letter Ka,...,Ha
        "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
        "$KanV = \\u0CCD;" // Kannada Sign Virama
        "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha
        "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
        "$MalV = \\u0D4D;" // Malayalam Sign Virama
        "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha
        "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
        "!!chain;"
        "!!forward;"
        "$CR $LF;"
        "$L ($L | $V | $LV | $LVT);"
        "($LV | $V) ($V | $T);"
        "($LVT | $T) $T;"
        "[^$Control $CR $LF] $Extend;"
        "[^$Control $CR $LF] $SpacingMark;"
        "$RI $RI / $RI;"
        "$RI $RI;"
        "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
        "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
        "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
        "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
        "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
        "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
        "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
        "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
        "!!reverse;"
        "$LF $CR;"
        "($L | $V | $LV | $LVT) $L;"
        "($V | $T) ($LV | $V);"
        "$T ($LVT | $T);"
        "$Extend [^$Control $CR $LF];"
        "$SpacingMark [^$Control $CR $LF];"
        "$RI $RI / $RI $RI;"
        "$RI $RI;"
        "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
        "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
        "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
        "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
        "$Ori1 $OriV $Ori0;" // Oriya Virama (backward)
        "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
        "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
        "$Mal1 $MalV $Mal0;"; // Malayalam Virama (backward)

    static bool createdCursorMovementIterator = false;
    static TextBreakIterator* staticCursorMovementIterator;
    return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
}