// blob: 664d1ea02112d043891ff692d663da9bd36bb28a [file] [log] [blame]
/*
* Copyright (C) 2011 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package android.text.method;
import android.text.Selection;
import android.text.SpannableStringBuilder;
import android.icu.text.BreakIterator;
import java.util.Locale;
/**
 * Walks through cursor positions at word boundaries. Internally uses
 * {@link BreakIterator#getWordInstance(Locale)}, and caches a {@link String} copy of a window
 * of the analysed text for performance reasons.
 *
 * <p>Public methods take and return offsets expressed in the coordinates of the
 * {@link CharSequence} last passed to {@link #setCharSequence(CharSequence, int, int)}; the
 * translation to and from the cached window is handled internally.
 *
 * Also provides methods to determine word boundaries.
 * {@hide}
 */
public class WordIterator implements Selection.PositionIterator {
    // Size of the window for the word iterator, should be greater than the longest word's length
    private static final int WINDOW_WIDTH = 50;

    // Cached copy of the analysed window; stays null until setCharSequence() is called.
    private String mString;
    // Index, in the original text, of the first character of mString.
    private int mOffsetShift;
    private final BreakIterator mIterator;

    /**
     * Constructs a WordIterator using the default locale.
     */
    public WordIterator() {
        this(Locale.getDefault());
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     * @param locale The locale to be used when analysing the text.
     */
    public WordIterator(Locale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Sets the text to analyse. Only the range [start, end], widened by
     * {@code WINDOW_WIDTH} characters on each side and clamped to the bounds of the text,
     * is copied and handed to the underlying break iterator.
     *
     * @param charSequence the text to analyse.
     * @param start the start of the range of interest in {@code charSequence}.
     * @param end the end of the range of interest in {@code charSequence}.
     */
    public void setCharSequence(CharSequence charSequence, int start, int end) {
        final int windowStart = Math.max(0, start - WINDOW_WIDTH);
        final int windowEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
        mOffsetShift = windowStart;

        if (charSequence instanceof SpannableStringBuilder) {
            final SpannableStringBuilder builder = (SpannableStringBuilder) charSequence;
            mString = builder.substring(windowStart, windowEnd);
        } else {
            mString = charSequence.subSequence(windowStart, windowEnd).toString();
        }
        mIterator.setText(mString);
    }

    /**
     * {@inheritDoc}
     *
     * <p>Boundaries that are not immediately followed by a letter or digit are skipped, so
     * the result is either {@code BreakIterator.DONE} or a position located right before a
     * letter or digit.
     *
     * <p>NOTE(review): unlike {@link #isBoundary(int)}, the offset is not validated here; an
     * out-of-window offset is passed straight to the underlying break iterator.
     */
    public int preceding(int offset) {
        int candidate = mIterator.preceding(offset - mOffsetShift);
        while (candidate != BreakIterator.DONE && !isOnLetterOrDigit(candidate)) {
            candidate = mIterator.preceding(candidate);
        }
        return unshift(candidate);
    }

    /**
     * {@inheritDoc}
     *
     * <p>Boundaries that are not immediately preceded by a letter or digit are skipped, so
     * the result is either {@code BreakIterator.DONE} or a position located right after a
     * letter or digit.
     *
     * <p>NOTE(review): unlike {@link #isBoundary(int)}, the offset is not validated here; an
     * out-of-window offset is passed straight to the underlying break iterator.
     */
    public int following(int offset) {
        int candidate = mIterator.following(offset - mOffsetShift);
        while (candidate != BreakIterator.DONE && !isAfterLetterOrDigit(candidate)) {
            candidate = mIterator.following(candidate);
        }
        return unshift(candidate);
    }

    /**
     * Indicates whether the underlying break iterator reports a boundary at the given offset.
     *
     * @param offset the position to test.
     * @return whether {@code offset} is a boundary.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public boolean isBoundary(int offset) {
        final int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);
        return mIterator.isBoundary(shiftedOffset);
    }

    /**
     * Returns the position of next boundary after the given offset. Returns
     * {@code DONE} if there is no boundary after the given offset.
     *
     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
     */
    public int nextBoundary(int offset) {
        return unshift(mIterator.following(offset - mOffsetShift));
    }

    /**
     * Returns the position of boundary preceding the given offset or
     * {@code DONE} if the given offset specifies the starting position.
     *
     * @param offset the given start position to search from.
     * @return the position of the last boundary preceding the given offset.
     */
    public int prevBoundary(int offset) {
        return unshift(mIterator.preceding(offset - mOffsetShift));
    }

    /** If <code>offset</code> is within a word, returns the index of the first character of that
     * word, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
     * The returned value is within [0..offset] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getBeginning(int offset) {
        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
        // so this method can be removed.
        return getBeginning(offset, false);
    }

    /**
     * If <code>offset</code> is within a word, returns the index of the last character of that
     * word plus one, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
     * The returned value is within [offset..textLength] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getEnd(int offset) {
        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
        // so this method can be removed.
        return getEnd(offset, false);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the start of the previous word, AA.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
        return getBeginning(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not
     * be considered part of the word) then this returns the index of the last character
     * plus one of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the end of the next word, BB.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getNextWordEndOnTwoWordBoundary(int offset) {
        return getEnd(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
     * return the start of the previous word, AA. Otherwise it would return the current offset,
     * the start of BB.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
        final int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);

        if (isOnLetterOrDigit(shiftedOffset)) {
            // The offset itself is the answer when it is a boundary, unless it also ends a
            // previous word and the caller asked for that previous word.
            final boolean startsHere = mIterator.isBoundary(shiftedOffset)
                    && !(getPrevWordBeginningOnTwoWordsBoundary
                            && isAfterLetterOrDigit(shiftedOffset));
            if (startsHere) {
                // Equal to shiftedOffset + mOffsetShift.
                return offset;
            }
            return unshift(mIterator.preceding(shiftedOffset));
        }

        // Not on a word character: only the end-of-word position still belongs to a word.
        if (isAfterLetterOrDigit(shiftedOffset)) {
            return unshift(mIterator.preceding(shiftedOffset));
        }
        return BreakIterator.DONE;
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not be
     * considered part of the word) then this returns the index of the last character plus one
     * of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
     * the end of the next word, BB. Otherwise it would return the current offset, the end
     * of AA.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
        final int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);

        if (isAfterLetterOrDigit(shiftedOffset)) {
            // The offset itself is the answer when it is a boundary, unless it also starts a
            // next word and the caller asked for that next word.
            final boolean endsHere = mIterator.isBoundary(shiftedOffset)
                    && !(getNextWordEndOnTwoWordBoundary && isOnLetterOrDigit(shiftedOffset));
            if (endsHere) {
                // Equal to shiftedOffset + mOffsetShift.
                return offset;
            }
            return unshift(mIterator.following(shiftedOffset));
        }

        // Not after a word character: only the start-of-word position still belongs to a word.
        if (isOnLetterOrDigit(shiftedOffset)) {
            return unshift(mIterator.following(shiftedOffset));
        }
        return BreakIterator.DONE;
    }

    /**
     * Searches backward from <code>offset</code>, one boundary at a time (see
     * {@link #prevBoundary(int)}), for the first character of a group of punctuation as
     * defined by {@link #isPunctuation(int)}. <code>offset</code> itself is returned when it
     * is already on the first character of such a group.
     *
     * @param offset the offset to search from.
     * @return the index of the first character of the punctuation group found, or
     *         BreakIterator.DONE if the search runs out of boundaries.
     */
    public int getPunctuationBeginning(int offset) {
        int position = offset;
        while (position != BreakIterator.DONE && !isPunctuationStartBoundary(position)) {
            position = prevBoundary(position);
        }
        // No need to shift position, prevBoundary handles that.
        return position;
    }

    /**
     * Searches forward from <code>offset</code>, one boundary at a time (see
     * {@link #nextBoundary(int)}), for the end of a group of punctuation as defined by
     * {@link #isPunctuation(int)}. <code>offset</code> itself is returned when it is already
     * right after the last character of such a group.
     *
     * @param offset the offset to search from.
     * @return the index of the last character of the punctuation group found plus one, or
     *         BreakIterator.DONE if the search runs out of boundaries.
     */
    public int getPunctuationEnd(int offset) {
        int position = offset;
        while (position != BreakIterator.DONE && !isPunctuationEndBoundary(position)) {
            position = nextBoundary(position);
        }
        // No need to shift position, nextBoundary handles that.
        return position;
    }

    /**
     * Indicates if the provided offset is after a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is after a punctuation character. Always false when there is
     *         no character before the offset inside the cached window.
     */
    public boolean isAfterPunctuation(int offset) {
        final int shiftedOffset = offset - mOffsetShift;
        return shiftedOffset >= 1 && shiftedOffset <= mString.length()
                && isPunctuation(mString.codePointBefore(shiftedOffset));
    }

    /**
     * Indicates if the provided offset is at a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is at a punctuation character. Always false when the offset
     *         does not designate a character of the cached window.
     */
    public boolean isOnPunctuation(int offset) {
        final int shiftedOffset = offset - mOffsetShift;
        return shiftedOffset >= 0 && shiftedOffset < mString.length()
                && isPunctuation(mString.codePointAt(shiftedOffset));
    }

    /** Whether a punctuation group starts at offset: punctuation at it, none right before. */
    private boolean isPunctuationStartBoundary(int offset) {
        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    }

    /** Whether a punctuation group ends at offset: punctuation right before it, none at it. */
    private boolean isPunctuationEndBoundary(int offset) {
        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    }

    /** Whether the code point belongs to one of the Unicode punctuation general categories. */
    private static boolean isPunctuation(int cp) {
        switch (Character.getType(cp)) {
            case Character.CONNECTOR_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.START_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /** Whether the code point ending right before the window-relative offset is a letter or digit. */
    private boolean isAfterLetterOrDigit(int shiftedOffset) {
        return shiftedOffset >= 1 && shiftedOffset <= mString.length()
                && Character.isLetterOrDigit(mString.codePointBefore(shiftedOffset));
    }

    /** Whether the code point starting at the window-relative offset is a letter or digit. */
    private boolean isOnLetterOrDigit(int shiftedOffset) {
        return shiftedOffset >= 0 && shiftedOffset < mString.length()
                && Character.isLetterOrDigit(mString.codePointAt(shiftedOffset));
    }

    /**
     * Converts a window-relative result of the break iterator back to the coordinates of the
     * original text. {@code BreakIterator.DONE} is passed through untouched so that it can
     * never be turned into a bogus index by adding the window shift.
     */
    private int unshift(int shiftedOffset) {
        if (shiftedOffset == BreakIterator.DONE) {
            return BreakIterator.DONE;
        }
        return shiftedOffset + mOffsetShift;
    }

    /**
     * Ensures the window-relative offset lies within [0, window length].
     *
     * @throws IllegalArgumentException if it does not; the message is expressed in the
     *         coordinates of the original text.
     */
    private void checkOffsetIsValid(int shiftedOffset) {
        if (shiftedOffset < 0 || shiftedOffset > mString.length()) {
            throw new IllegalArgumentException("Invalid offset: " + (shiftedOffset + mOffsetShift) +
                    ". Valid range is [" + mOffsetShift + ", " + (mString.length() + mOffsetShift) +
                    "]");
        }
    }
}