| /* |
| * Copyright (C) 2006 Lars Knoll <lars@trolltech.com> |
| * Copyright (C) 2007 Apple Inc. All rights reserved. |
| * Copyright (C) 2008 Jürg Billeter <j@bitron.ch> |
| * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com> |
| * Copyright (C) 2010 Igalia S.L. |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Library General Public |
| * License as published by the Free Software Foundation; either |
| * version 2 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Library General Public License for more details. |
| * |
| * You should have received a copy of the GNU Library General Public License |
| * along with this library; see the file COPYING.LIB. If not, write to |
| * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
| * Boston, MA 02110-1301, USA. |
| * |
| */ |
| |
| #include "config.h" |
| #include "TextBreakIterator.h" |
| |
| #include <wtf/Atomics.h> |
| #include <wtf/gobject/GOwnPtr.h> |
| #include <pango/pango.h> |
| |
| using namespace WTF; |
| using namespace std; |
| |
// Evaluates to true when |character| is a code point in U+10000..U+10FFFF,
// i.e. one that needs two UTF-16 code units (a surrogate pair).
// The argument is parenthesized so that arguments built from low-precedence
// operators (for example `a | b`) expand correctly. Note that the argument is
// evaluated twice, so it must not have side effects.
#define UTF8_IS_SURROGATE(character) ((character) >= 0x10000 && (character) <= 0x10FFFF)
| |
| namespace WebCore { |
| |
| class CharacterIterator { |
| public: |
| bool setText(const UChar* string, int length); |
| const gchar* getText() { return m_utf8.get(); } |
| int getLength() { return m_length; } |
| glong getSize() { return m_size; } |
| void setIndex(int index); |
| int getIndex() { return m_index; } |
| void setUTF16Index(int index); |
| int getUTF16Index() { return m_utf16Index; } |
| int getUTF16Length() { return m_utf16Length; } |
| int first(); |
| int last(); |
| int next(); |
| int previous(); |
| private: |
| int characterSize(int index); |
| |
| GOwnPtr<char> m_utf8; |
| int m_length; |
| long m_size; |
| int m_index; |
| int m_utf16Index; |
| int m_utf16Length; |
| }; |
| |
| int CharacterIterator::characterSize(int index) |
| { |
| if (index == m_length || index < 0) |
| return 0; |
| if (m_length == m_utf16Length) |
| return 1; |
| |
| gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index); |
| gunichar character = g_utf8_get_char(indexPtr); |
| return UTF8_IS_SURROGATE(character) ? 2 : 1; |
| } |
| |
| bool CharacterIterator::setText(const UChar* string, int length) |
| { |
| long utf8Size = 0; |
| m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0)); |
| if (!utf8Size) |
| return false; |
| |
| m_utf16Length = length; |
| m_length = g_utf8_strlen(m_utf8.get(), utf8Size); |
| m_size = utf8Size; |
| m_index = 0; |
| m_utf16Index = 0; |
| |
| return true; |
| } |
| |
| void CharacterIterator::setIndex(int index) |
| { |
| if (index == m_index) |
| return; |
| if (index <= 0) |
| m_index = m_utf16Index = 0; |
| else if (index >= m_length) { |
| m_index = m_length; |
| m_utf16Index = m_utf16Length; |
| } else if (m_length == m_utf16Length) |
| m_index = m_utf16Index = index; |
| else { |
| m_index = index; |
| int utf16Index = 0; |
| int utf8Index = 0; |
| while (utf8Index < index) { |
| utf16Index += characterSize(utf8Index); |
| utf8Index++; |
| } |
| m_utf16Index = utf16Index; |
| } |
| } |
| |
| void CharacterIterator::setUTF16Index(int index) |
| { |
| if (index == m_utf16Index) |
| return; |
| if (index <= 0) |
| m_utf16Index = m_index = 0; |
| else if (index >= m_utf16Length) { |
| m_utf16Index = m_utf16Length; |
| m_index = m_length; |
| } else if (m_length == m_utf16Length) |
| m_utf16Index = m_index = index; |
| else { |
| m_utf16Index = index; |
| int utf16Index = 0; |
| int utf8Index = 0; |
| while (utf16Index < index) { |
| utf16Index += characterSize(utf8Index); |
| utf8Index++; |
| } |
| m_index = utf8Index; |
| } |
| } |
| |
| int CharacterIterator::first() |
| { |
| m_index = m_utf16Index = 0; |
| return m_index; |
| } |
| |
| int CharacterIterator::last() |
| { |
| m_index = m_length; |
| m_utf16Index = m_utf16Length; |
| return m_index; |
| } |
| |
| int CharacterIterator::next() |
| { |
| int next = m_index + 1; |
| |
| if (next <= m_length) { |
| m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length); |
| m_index = next; |
| } else { |
| m_index = TextBreakDone; |
| m_utf16Index = TextBreakDone; |
| } |
| |
| return m_index; |
| } |
| |
| int CharacterIterator::previous() |
| { |
| int previous = m_index - 1; |
| |
| if (previous >= 0) { |
| m_utf16Index = max(m_utf16Index - characterSize(previous), 0); |
| m_index = previous; |
| } else { |
| m_index = TextBreakDone; |
| m_utf16Index = TextBreakDone; |
| } |
| |
| return m_index; |
| } |
| |
// Kind of boundary a TextBreakIterator looks for. textBreakNext() and
// textBreakPrevious() stop at the first position whose PangoLogAttr matches
// the selected kind.
// NOTE(review): the names look like they mirror ICU's UBreakIteratorType - confirm.
| enum UBreakIteratorType { |
// Stops where is_cursor_position is set.
| UBRK_CHARACTER, |
// Stops where is_word_start or is_word_end is set.
| UBRK_WORD, |
// Stops where is_line_break is set.
| UBRK_LINE, |
// Stops where is_sentence_boundary is set.
| UBRK_SENTENCE |
| }; |
| |
| class TextBreakIterator { |
| public: |
| UBreakIteratorType m_type; |
| PangoLogAttr* m_logAttrs; |
| CharacterIterator m_charIterator; |
| }; |
| |
| static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, |
| UBreakIteratorType type, const UChar* string, int length) |
| { |
| if (!string) |
| return 0; |
| |
| if (!createdIterator) { |
| iterator = new TextBreakIterator(); |
| createdIterator = true; |
| } |
| if (!iterator) |
| return 0; |
| |
| if (!iterator->m_charIterator.setText(string, length)) |
| return 0; |
| |
| int charLength = iterator->m_charIterator.getLength(); |
| |
| iterator->m_type = type; |
| if (createdIterator) |
| g_free(iterator->m_logAttrs); |
| iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1); |
| pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(), |
| -1, 0, iterator->m_logAttrs, charLength + 1); |
| |
| return iterator; |
| } |
| |
// Slot holding at most one idle character iterator for reuse. The
// NonSharedCharacterBreakIterator constructor takes the iterator out of this
// slot (swapping in 0) and the destructor tries to put its iterator back, both
// through weakCompareAndSwap().
// NOTE(review): presumably this is meant to allow use from several threads - confirm.
| static TextBreakIterator* nonSharedCharacterBreakIterator; |
| |
| NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, int length) |
| { |
| m_iterator = nonSharedCharacterBreakIterator; |
| bool createdIterator = m_iterator && weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), m_iterator, 0); |
| m_iterator = setUpIterator(createdIterator, m_iterator, UBRK_CHARACTER, buffer, length); |
| } |
| |
| NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() |
| { |
| if (!weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), 0, m_iterator)) |
| delete m_iterator; |
| } |
| |
| TextBreakIterator* cursorMovementIterator(const UChar* string, int length) |
| { |
| // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version. |
| static bool createdCursorMovementIterator = false; |
| static TextBreakIterator* staticCursorMovementIterator; |
| return setUpIterator(createdCursorMovementIterator, staticCursorMovementIterator, UBRK_CHARACTER, string, length); |
| } |
| |
| TextBreakIterator* wordBreakIterator(const UChar* string, int length) |
| { |
| static bool createdWordBreakIterator = false; |
| static TextBreakIterator* staticWordBreakIterator; |
| return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length); |
| } |
| |
| static bool createdLineBreakIterator = false; |
| static TextBreakIterator* staticLineBreakIterator; |
| |
| TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString&) |
| { |
| TextBreakIterator* lineBreakIterator = 0; |
| if (!createdLineBreakIterator || staticLineBreakIterator) { |
| setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length); |
| swap(staticLineBreakIterator, lineBreakIterator); |
| } |
| |
| if (!lineBreakIterator) { |
| bool createdNewLineBreakIterator = false; |
| setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length); |
| } |
| |
| return lineBreakIterator; |
| } |
| |
| void releaseLineBreakIterator(TextBreakIterator* iterator) |
| { |
| ASSERT(createdLineBreakIterator); |
| ASSERT(iterator); |
| |
| if (!staticLineBreakIterator) |
| staticLineBreakIterator = iterator; |
| else |
| delete iterator; |
| } |
| |
| TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) |
| { |
| static bool createdSentenceBreakIterator = false; |
| static TextBreakIterator* staticSentenceBreakIterator; |
| return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length); |
| } |
| |
| int textBreakFirst(TextBreakIterator* iterator) |
| { |
| iterator->m_charIterator.first(); |
| return iterator->m_charIterator.getUTF16Index(); |
| } |
| |
| int textBreakLast(TextBreakIterator* iterator) |
| { |
| // TextBreakLast is not meant to find just any break according to bi->m_type |
| // but really the one near the last character. |
| // (cmp ICU documentation for ubrk_first and ubrk_last) |
| // From ICU docs for ubrk_last: |
| // "Determine the index immediately beyond the last character in the text being scanned." |
| |
| // So we should advance or traverse back based on bi->m_logAttrs cursor positions. |
| // If last character position in the original string is a whitespace, |
| // traverse to the left until the first non-white character position is found |
| // and return the position of the first white-space char after this one. |
| // Otherwise return m_length, as "the first character beyond the last" is outside our string. |
| |
| bool whiteSpaceAtTheEnd = true; |
| int nextWhiteSpacePos = iterator->m_charIterator.getLength(); |
| |
| int pos = iterator->m_charIterator.last(); |
| while (pos >= 0 && whiteSpaceAtTheEnd) { |
| if (iterator->m_logAttrs[pos].is_cursor_position) { |
| if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white) |
| nextWhiteSpacePos = pos; |
| } |
| pos = iterator->m_charIterator.previous(); |
| } |
| iterator->m_charIterator.setIndex(nextWhiteSpacePos); |
| return iterator->m_charIterator.getUTF16Index(); |
| } |
| |
| int textBreakNext(TextBreakIterator* iterator) |
| { |
| while (iterator->m_charIterator.next() != TextBreakDone) { |
| int index = iterator->m_charIterator.getIndex(); |
| |
| // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol €, |
| // are not marked as word_start & word_end as opposed to the way ICU does it. |
| // This leads to - for example - different word selection behaviour when right clicking. |
| |
| if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) |
| || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) |
| || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) |
| || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { |
| break; |
| } |
| } |
| return iterator->m_charIterator.getUTF16Index(); |
| } |
| |
| int textBreakPrevious(TextBreakIterator* iterator) |
| { |
| while (iterator->m_charIterator.previous() != TextBreakDone) { |
| int index = iterator->m_charIterator.getIndex(); |
| |
| if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) |
| || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) |
| || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) |
| || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { |
| break; |
| } |
| } |
| return iterator->m_charIterator.getUTF16Index(); |
| } |
| |
| int textBreakPreceding(TextBreakIterator* iterator, int offset) |
| { |
| if (offset > iterator->m_charIterator.getUTF16Length()) |
| return TextBreakDone; |
| if (offset < 0) |
| return 0; |
| iterator->m_charIterator.setUTF16Index(offset); |
| return textBreakPrevious(iterator); |
| } |
| |
| int textBreakFollowing(TextBreakIterator* iterator, int offset) |
| { |
| if (offset > iterator->m_charIterator.getUTF16Length()) |
| return TextBreakDone; |
| if (offset < 0) |
| return 0; |
| iterator->m_charIterator.setUTF16Index(offset); |
| return textBreakNext(iterator); |
| } |
| |
| int textBreakCurrent(TextBreakIterator* iterator) |
| { |
| return iterator->m_charIterator.getUTF16Index(); |
| } |
| |
| bool isTextBreak(TextBreakIterator* iterator, int offset) |
| { |
| if (!offset) |
| return true; |
| if (offset > iterator->m_charIterator.getUTF16Length()) |
| return false; |
| |
| iterator->m_charIterator.setUTF16Index(offset); |
| |
| int index = iterator->m_charIterator.getIndex(); |
| iterator->m_charIterator.previous(); |
| textBreakNext(iterator); |
| return iterator->m_charIterator.getIndex() == index; |
| } |
| |
// Always answers true; the iterator argument is ignored.
// NOTE(review): looks like a placeholder for a real word-boundary check - confirm against callers.
| bool isWordTextBreak(TextBreakIterator*) |
| { |
| return true; |
| } |
| |
| } |