| /* |
| * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. |
| * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| * |
| * This code is free software; you can redistribute it and/or modify it |
| * under the terms of the GNU General Public License version 2 only, as |
| * published by the Free Software Foundation. Oracle designates this |
| * particular file as subject to the "Classpath" exception as provided |
| * by Oracle in the LICENSE file that accompanied this code. |
| * |
| * This code is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| * version 2 for more details (a copy is included in the LICENSE file that |
| * accompanied this code). |
| * |
| * You should have received a copy of the GNU General Public License version |
| * 2 along with this work; if not, write to the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| * |
| * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
| * or visit www.oracle.com if you need additional information or have any |
| * questions. |
| */ |
| |
| /* |
| ******************************************************************************* |
| * Copyright (C) 2000-2014, International Business Machines Corporation and |
| * others. All Rights Reserved. |
| ******************************************************************************* |
| */ |
| package sun.text.normalizer; |
| |
| import java.text.CharacterIterator; |
| import java.text.Normalizer; |
| |
| /** |
| * Unicode Normalization |
| * |
| * <h2>Unicode normalization API</h2> |
| * |
| * <code>normalize</code> transforms Unicode text into an equivalent composed or |
| * decomposed form, allowing for easier sorting and searching of text. |
| * <code>normalize</code> supports the standard normalization forms described in |
| * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> |
| * Unicode Standard Annex #15 — Unicode Normalization Forms</a>. |
| * |
| * Characters with accents or other adornments can be encoded in |
| * several different ways in Unicode. For example, take the character A-acute. |
| * In Unicode, this can be encoded as a single character (the |
| * "composed" form): |
| * |
| * <pre> |
| * 00C1 LATIN CAPITAL LETTER A WITH ACUTE |
| * </pre> |
| * |
| * or as two separate characters (the "decomposed" form): |
| * |
| * <pre> |
| * 0041 LATIN CAPITAL LETTER A |
| * 0301 COMBINING ACUTE ACCENT |
| * </pre> |
| * |
| * To a user of your program, however, both of these sequences should be |
| * treated as the same "user-level" character "A with acute accent". When you |
| * are searching or comparing text, you must ensure that these two sequences are |
| * treated equivalently. In addition, you must handle characters with more than |
| * one accent. Sometimes the order of a character's combining accents is |
| * significant, while in other cases accent sequences in different orders are |
| * really equivalent. |
| * |
| * Similarly, the string "ffi" can be encoded as three separate letters: |
| * |
| * <pre> |
| * 0066 LATIN SMALL LETTER F |
| * 0066 LATIN SMALL LETTER F |
| * 0069 LATIN SMALL LETTER I |
| * </pre> |
| * |
| * or as the single character |
| * |
| * <pre> |
| * FB03 LATIN SMALL LIGATURE FFI |
| * </pre> |
| * |
| * The ffi ligature is not a distinct semantic character, and strictly speaking |
| * it shouldn't be in Unicode at all, but it was included for compatibility |
| * with existing character sets that already provided it. The Unicode standard |
| * identifies such characters by giving them "compatibility" decompositions |
| * into the corresponding semantic characters. When sorting and searching, you |
| * will often want to use these mappings. |
| * |
| * <code>normalize</code> helps solve these problems by transforming text into |
| * the canonical composed and decomposed forms as shown in the first example |
| * above. In addition, you can have it perform compatibility decompositions so |
| * that you can treat compatibility characters the same as their equivalents. |
| * Finally, <code>normalize</code> rearranges accents into the proper canonical |
| * order, so that you do not have to worry about accent rearrangement on your |
| * own. |
| * |
| * Form FCD, "Fast C or D", is also designed for collation. |
| * It allows an algorithm that works under "canonical closure" (as collation |
| * does), i.e., one that treats precomposed characters and their decomposed |
| * equivalents the same, to work on strings that are not necessarily |
| * normalized. |
| * |
| * It is not a normalization form because it does not provide for uniqueness of |
| * representation. Multiple strings may be canonically equivalent (their NFDs |
| * are identical) and may all conform to FCD without being identical themselves. |
| * |
| * The form is defined such that the "raw decomposition", the recursive |
| * canonical decomposition of each character, results in a string that is |
| * canonically ordered. This means that precomposed characters are allowed for |
| * as long as their decompositions do not need canonical reordering. |
| * |
| * Its advantage for a process like collation is that all NFD and most NFC texts |
| * - and many unnormalized texts - already conform to FCD and do not need to be |
| * normalized (NFD) for such a process. The FCD quick check will return YES for |
| * most strings in practice. |
| * |
| * normalize(FCD) may be implemented with NFD. |
| * |
| * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): |
| * http://www.unicode.org/notes/tn5/#FCD |
| * |
| * ICU collation performs either NFD or FCD normalization automatically if |
| * normalization is turned on for the collator object. Beyond collation and |
| * string search, normalized strings may be useful for string equivalence |
| * comparisons, transliteration/transcription, unique representations, etc. |
| * |
| * The W3C generally recommends exchanging texts in NFC. |
| * Note also that most legacy character encodings use only precomposed forms and |
| * often do not encode any combining marks by themselves. For conversion to such |
| * character encodings the Unicode text needs to be normalized to NFC. |
| * For more usage examples, see the Unicode Standard Annex. |
| * |
| * Note: The Normalizer class also provides API for iterative normalization. |
| * While the setIndex() and getIndex() refer to indices in the |
| * underlying Unicode input text, the next() and previous() methods |
| * iterate through characters in the normalized output. |
| * This means that there is not necessarily a one-to-one correspondence |
| * between characters returned by next() and previous() and the indices |
| * passed to and returned from setIndex() and getIndex(). |
| * It is for this reason that Normalizer does not implement the CharacterIterator interface. |
| * |
| * @stable ICU 2.8 |
| */ |
| // Original filename in ICU4J: Normalizer.java |
| public final class NormalizerBase implements Cloneable { |
| |
| // The input text being normalized. The iteration helpers reposition its |
| // index (text.setIndex) before reading each segment. |
| private UCharacterIterator text; |
| private Normalizer2 norm2; |
| private Mode mode; |
| private int options; |
| |
| // The normalization buffer holds the result of normalizing the source |
| // range [currentIndex..nextIndex]; both are indices into the input text. |
| private int currentIndex; |
| private int nextIndex; |
| |
| // Normalized output of the current segment, plus the read position within |
| // it (a char offset, moved by Character.charCount per code point). |
| private StringBuilder buffer; |
| private int bufferPos; |
| |
| // Helper classes to defer loading of normalization data. |
| private static final class ModeImpl { |
|     /** The normalizer carried by this holder; assigned once at construction. */ |
|     private final Normalizer2 normalizer2; |
| |
|     private ModeImpl(Normalizer2 n2) { |
|         this.normalizer2 = n2; |
|     } |
| } |
| |
| // Holder whose static initializer wraps Normalizer2.getNFDInstance(). |
| private static final class NFDModeImpl { |
| private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); |
| } |
| |
| // Holder whose static initializer wraps Normalizer2.getNFKDInstance(). |
| private static final class NFKDModeImpl { |
| private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); |
| } |
| |
| // Holder whose static initializer wraps Normalizer2.getNFCInstance(). |
| private static final class NFCModeImpl { |
| private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); |
| } |
| |
| // Holder whose static initializer wraps Normalizer2.getNFKCInstance(). |
| private static final class NFKCModeImpl { |
| private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); |
| } |
| |
| // Frozen set built from the pattern "[:age=3.2:]"; used as the filter for |
| // the four FilteredNormalizer2 holders below. |
| private static final class Unicode32 { |
| private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); |
| } |
| |
| // NFD normalizer wrapped in a FilteredNormalizer2 over Unicode32.INSTANCE. |
| private static final class NFD32ModeImpl { |
| private static final ModeImpl INSTANCE = |
| new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), |
| Unicode32.INSTANCE)); |
| } |
| |
| // NFKD normalizer wrapped in a FilteredNormalizer2 over Unicode32.INSTANCE. |
| private static final class NFKD32ModeImpl { |
| private static final ModeImpl INSTANCE = |
| new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), |
| Unicode32.INSTANCE)); |
| } |
| |
| // NFC normalizer wrapped in a FilteredNormalizer2 over Unicode32.INSTANCE. |
| private static final class NFC32ModeImpl { |
| private static final ModeImpl INSTANCE = |
| new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), |
| Unicode32.INSTANCE)); |
| } |
| |
| // NFKC normalizer wrapped in a FilteredNormalizer2 over Unicode32.INSTANCE. |
| private static final class NFKC32ModeImpl { |
| private static final ModeImpl INSTANCE = |
| new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), |
| Unicode32.INSTANCE)); |
| } |
| |
| /** |
| * Options bit set value to select Unicode 3.2 normalization |
| * (except NormalizationCorrections). |
| * At most one Unicode version can be selected at a time. |
| * The Mode implementations in this class test this bit to choose the |
| * Unicode 3.2 filtered normalizer over the unfiltered one. |
| * @stable ICU 2.6 |
| */ |
| public static final int UNICODE_3_2=0x20; |
| |
| /** |
| * Alias with the same value as {@link #UNICODE_3_2}. |
| */ |
| public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; |
| |
| /** |
| * Default option for the latest Unicode normalization. This option is |
| * provided mainly for testing. |
| * The value zero means that normalization is done with the fixes for |
| * - Corrigendum 4 (Five CJK Canonical Mapping Errors) |
| * - Corrigendum 5 (Normalization Idempotency) |
| */ |
| public static final int UNICODE_LATEST = 0x00; |
| |
| /** |
| * Constant indicating that the end of the iteration has been reached. |
| * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. |
| * @stable ICU 2.8 |
| */ |
| public static final int DONE = UCharacterIterator.DONE; |
| |
| /** |
| * Constants for normalization modes. |
| * <p> |
| * The Mode class is not intended for public subclassing. |
| * Only the Mode constants provided by the Normalizer class should be used, |
| * and any fields or methods should not be called or overridden by users. |
| * @stable ICU 2.8 |
| */ |
| public abstract static class Mode { |
| |
| /** |
| * Sole constructor |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| protected Mode() { |
| } |
| |
| /** |
| * Returns the {@code Normalizer2} that implements this mode for the |
| * given option bits. |
| * |
| * @param options option bits such as {@link #UNICODE_3_2}, or 0 |
| * @return the normalizer to use for this mode and these options |
| * @internal |
| * @deprecated This API is ICU internal only. |
| */ |
| @Deprecated |
| protected abstract Normalizer2 getNormalizer2(int options); |
| } |
| |
| /** |
|  * Maps a {@link java.text.Normalizer.Form} onto the matching {@link Mode} |
|  * constant of this class. |
|  * |
|  * @param form the normalization form to translate |
|  * @return {@link #NFC}, {@link #NFD}, {@link #NFKC} or {@link #NFKD} |
|  * @throws IllegalArgumentException if {@code form} is none of the four |
|  *         forms handled here |
|  */ |
| private static Mode toMode(Normalizer.Form form) { |
|     switch (form) { |
|         case NFC: |
|             return NFC; |
|         case NFD: |
|             return NFD; |
|         case NFKC: |
|             return NFKC; |
|         case NFKD: |
|             return NFKD; |
|         default: |
|             throw new IllegalArgumentException("Unexpected normalization form: " + form); |
|     } |
| } |
| |
| private static final class NONEMode extends Mode { |
| protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } |
| } |
| |
| /** NFD mode: picks the Unicode 3.2 filtered normalizer when that bit is set. */ |
| private static final class NFDMode extends Mode { |
|     @Override |
|     protected Normalizer2 getNormalizer2(int options) { |
|         if ((options & UNICODE_3_2) == 0) { |
|             return NFDModeImpl.INSTANCE.normalizer2; |
|         } |
|         return NFD32ModeImpl.INSTANCE.normalizer2; |
|     } |
| } |
| |
| /** NFKD mode: picks the Unicode 3.2 filtered normalizer when that bit is set. */ |
| private static final class NFKDMode extends Mode { |
|     @Override |
|     protected Normalizer2 getNormalizer2(int options) { |
|         if ((options & UNICODE_3_2) == 0) { |
|             return NFKDModeImpl.INSTANCE.normalizer2; |
|         } |
|         return NFKD32ModeImpl.INSTANCE.normalizer2; |
|     } |
| } |
| |
| /** NFC mode: picks the Unicode 3.2 filtered normalizer when that bit is set. */ |
| private static final class NFCMode extends Mode { |
|     @Override |
|     protected Normalizer2 getNormalizer2(int options) { |
|         if ((options & UNICODE_3_2) == 0) { |
|             return NFCModeImpl.INSTANCE.normalizer2; |
|         } |
|         return NFC32ModeImpl.INSTANCE.normalizer2; |
|     } |
| } |
| |
| /** NFKC mode: picks the Unicode 3.2 filtered normalizer when that bit is set. */ |
| private static final class NFKCMode extends Mode { |
|     @Override |
|     protected Normalizer2 getNormalizer2(int options) { |
|         if ((options & UNICODE_3_2) == 0) { |
|             return NFKCModeImpl.INSTANCE.normalizer2; |
|         } |
|         return NFKC32ModeImpl.INSTANCE.normalizer2; |
|     } |
| } |
| |
| /** |
| * No decomposition/composition. |
| * @stable ICU 2.8 |
| */ |
| public static final Mode NONE = new NONEMode(); |
| |
| /** |
| * Canonical decomposition. |
| * @stable ICU 2.8 |
| */ |
| public static final Mode NFD = new NFDMode(); |
| |
| /** |
| * Compatibility decomposition. |
| * @stable ICU 2.8 |
| */ |
| public static final Mode NFKD = new NFKDMode(); |
| |
| /** |
| * Canonical decomposition followed by canonical composition. |
| * @stable ICU 2.8 |
| */ |
| public static final Mode NFC = new NFCMode(); |
| |
| /** |
| * Compatibility decomposition followed by canonical composition. |
| */ |
| public static final Mode NFKC =new NFKCMode(); |
| |
| //------------------------------------------------------------------------- |
| // Iterator constructors |
| //------------------------------------------------------------------------- |
| |
| /** |
| * Creates a new {@code NormalizerBase} object for iterating over the |
| * normalized form of a given string. |
| * <p> |
| * The {@code options} parameter specifies which optional |
| * {@code NormalizerBase} features are to be enabled for this object. |
| * <p> |
| * @param str The string to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param mode The normalization mode. |
| * |
| * @param opt Any optional features to be enabled. |
| * Currently the only available option is {@link #UNICODE_3_2}. |
| * If you want the default behavior corresponding to one of the |
| * standard Unicode Normalization Forms, use 0 for this argument. |
| * @stable ICU 2.6 |
| */ |
| public NormalizerBase(String str, Mode mode, int opt) { |
| this.text = UCharacterIterator.getInstance(str); |
| this.mode = mode; |
| this.options=opt; |
| norm2 = mode.getNormalizer2(opt); |
| buffer = new StringBuilder(); |
| } |
| |
| /** |
|  * Creates a new {@code NormalizerBase} over a string with no options set. |
|  * |
|  * @param str  the string to be normalized |
|  * @param mode the normalization mode |
|  */ |
| public NormalizerBase(String str, Mode mode) { |
|     // UNICODE_LATEST is the zero option value. |
|     this(str, mode, UNICODE_LATEST); |
| } |
| |
| |
| /** |
| * Creates a new {@code NormalizerBase} object for iterating over the |
| * normalized form of the given text. |
| * <p> |
| * @param iter The input text to be normalized. The normalization |
| * will start at the beginning of the string. |
| * |
| * @param mode The normalization mode. |
| * |
| * @param opt Any optional features to be enabled. |
| * Currently the only available option is {@link #UNICODE_3_2}. |
| * If you want the default behavior corresponding to one of the |
| * standard Unicode Normalization Forms, use 0 for this argument. |
| * @stable ICU 2.6 |
| */ |
| public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { |
| this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); |
| this.mode = mode; |
| this.options = opt; |
| norm2 = mode.getNormalizer2(opt); |
| buffer = new StringBuilder(); |
| } |
| |
| /** |
|  * Creates a new {@code NormalizerBase} over the given text with no options set. |
|  * |
|  * @param iter the input text to be normalized |
|  * @param mode the normalization mode |
|  */ |
| public NormalizerBase(CharacterIterator iter, Mode mode) { |
|     // UNICODE_LATEST is the zero option value. |
|     this(iter, mode, UNICODE_LATEST); |
| } |
| |
| /** |
| * Clones this {@code NormalizerBase} object. All properties of this |
| * object are duplicated in the new object, including the cloning of any |
| * {@link CharacterIterator} that was passed in to the constructor |
| * or to {@link #setText(CharacterIterator) setText}. |
| * However, the text storage underlying |
| * the {@code CharacterIterator} is not duplicated unless the |
| * iterator's {@code clone} method does so. |
| * @stable ICU 2.8 |
| */ |
| public Object clone() { |
| try { |
| NormalizerBase copy = (NormalizerBase) super.clone(); |
| copy.text = (UCharacterIterator) text.clone(); |
| copy.mode = mode; |
| copy.options = options; |
| copy.norm2 = norm2; |
| copy.buffer = new StringBuilder(buffer); |
| copy.bufferPos = bufferPos; |
| copy.currentIndex = currentIndex; |
| copy.nextIndex = nextIndex; |
| return copy; |
| } |
| catch (CloneNotSupportedException e) { |
| throw new InternalError(e.toString(), e); |
| } |
| } |
| |
| /** |
|  * Normalizes a {@code String} using the given normalization operation. |
|  * |
|  * @param str     the input string to be normalized |
|  * @param mode    the normalization mode |
|  * @param options option bits selecting optional features. Currently the only |
|  *                available option is {@link #UNICODE_3_2}; pass 0 for the |
|  *                default behavior of the standard Unicode Normalization Forms. |
|  * @return the normalized string |
|  * @stable ICU 2.6 |
|  */ |
| public static String normalize(String str, Mode mode, int options) { |
|     final Normalizer2 normalizer = mode.getNormalizer2(options); |
|     return normalizer.normalize(str); |
| } |
| |
| /** |
|  * Normalizes a {@code String} to the given {@link java.text.Normalizer.Form} |
|  * with the {@link #UNICODE_LATEST} option. |
|  * |
|  * @param str  the input string to be normalized |
|  * @param form the normalization form |
|  * @return the normalized string |
|  */ |
| public static String normalize(String str, Normalizer.Form form) { |
|     final Mode mode = toMode(form); |
|     return normalize(str, mode, UNICODE_LATEST); |
| } |
| |
| /** |
|  * Normalizes a {@code String} to the given {@link java.text.Normalizer.Form} |
|  * with the given option bits. |
|  * |
|  * @param str     the input string to be normalized |
|  * @param form    the normalization form |
|  * @param options option bits such as {@link #UNICODE_3_2}, or 0 |
|  * @return the normalized string |
|  */ |
| public static String normalize(String str, Normalizer.Form form, int options) { |
|     final Mode mode = toMode(form); |
|     return normalize(str, mode, options); |
| } |
| |
| /** |
|  * Tests whether a string is in a given normalization form. |
|  * This is semantically equivalent to |
|  * {@code str.equals(normalize(str, mode, options))}, and the answer is |
|  * always a definitive true or false, never a "maybe". |
|  * |
|  * @param str     the input string to be checked |
|  * @param mode    the normalization mode |
|  * @param options option bits; the only option currently recognized is |
|  *                {@link #UNICODE_3_2} |
|  * @return {@code true} if the selected normalizer reports {@code str} as |
|  *         normalized |
|  * @stable ICU 2.6 |
|  */ |
| public static boolean isNormalized(String str, Mode mode, int options) { |
|     final Normalizer2 normalizer = mode.getNormalizer2(options); |
|     return normalizer.isNormalized(str); |
| } |
| |
| /** |
|  * Tests whether a string is in the given {@link java.text.Normalizer.Form}, |
|  * using the {@link #UNICODE_LATEST} option. |
|  * |
|  * @param str  the input string to be checked |
|  * @param form the normalization form |
|  * @return {@code true} if {@code str} is normalized in that form |
|  */ |
| public static boolean isNormalized(String str, Normalizer.Form form) { |
|     final Mode mode = toMode(form); |
|     return isNormalized(str, mode, UNICODE_LATEST); |
| } |
| |
| /** |
|  * Tests whether a string is in the given {@link java.text.Normalizer.Form}, |
|  * using the given option bits. |
|  * |
|  * @param str     the input string to be checked |
|  * @param form    the normalization form |
|  * @param options option bits such as {@link #UNICODE_3_2}, or 0 |
|  * @return {@code true} if {@code str} is normalized in that form |
|  */ |
| public static boolean isNormalized(String str, Normalizer.Form form, int options) { |
|     final Mode mode = toMode(form); |
|     return isNormalized(str, mode, options); |
| } |
| |
| //------------------------------------------------------------------------- |
| // Iteration API |
| //------------------------------------------------------------------------- |
| |
| /** |
|  * Returns the current character in the normalized text without advancing. |
|  * If the buffer has been fully consumed, the next segment is normalized |
|  * first. |
|  * |
|  * @return the code point as an int, or {@link #DONE} if no further |
|  *         normalized output is available |
|  * @stable ICU 2.8 |
|  */ |
| public int current() { |
|     final boolean hasBuffered = bufferPos < buffer.length(); |
|     if (!hasBuffered && !nextNormalize()) { |
|         return DONE; |
|     } |
|     return buffer.codePointAt(bufferPos); |
| } |
| |
| /** |
|  * Returns the next character in the normalized text and advances the |
|  * iteration position past it. If the end of the text has already been |
|  * reached, {@link #DONE} is returned. |
|  * |
|  * @return the code point as an int, or {@link #DONE} |
|  * @stable ICU 2.8 |
|  */ |
| public int next() { |
|     if (bufferPos >= buffer.length() && !nextNormalize()) { |
|         return DONE; |
|     } |
|     final int codePoint = buffer.codePointAt(bufferPos); |
|     // Step over one or two chars, depending on the code point. |
|     bufferPos += Character.charCount(codePoint); |
|     return codePoint; |
| } |
| |
| /** |
|  * Returns the previous character in the normalized text and moves the |
|  * iteration position back before it. If the beginning of the text has |
|  * already been reached, {@link #DONE} is returned. |
|  * |
|  * @return the code point as an int, or {@link #DONE} |
|  * @stable ICU 2.8 |
|  */ |
| public int previous() { |
|     if (bufferPos <= 0 && !previousNormalize()) { |
|         return DONE; |
|     } |
|     final int codePoint = buffer.codePointBefore(bufferPos); |
|     // Step back over one or two chars, depending on the code point. |
|     bufferPos -= Character.charCount(codePoint); |
|     return codePoint; |
| } |
| |
| /** |
|  * Resets the index to the beginning of the text. |
|  * This is equivalent to {@code setIndexOnly(0)}. |
|  * @stable ICU 2.8 |
|  */ |
| public void reset() { |
|     // Rewinds the input to index 0, zeroes currentIndex and nextIndex, |
|     // and discards any buffered output. |
|     setIndexOnly(0); |
| } |
| |
| /** |
|  * Sets the iteration position in the input text that is being normalized, |
|  * without any immediate normalization. Any buffered output is discarded. |
|  * After {@code setIndexOnly()}, {@link #getIndex} returns the index given |
|  * here. |
|  * |
|  * @param index the desired index in the input text |
|  * @stable ICU 2.8 |
|  */ |
| public void setIndexOnly(int index) { |
|     text.setIndex(index); // validates index |
|     currentIndex = index; |
|     nextIndex = index; |
|     clearBuffer(); |
| } |
| |
| /** |
|  * Sets the iteration position in the input text that is being normalized |
|  * and returns the first normalized character at that position. |
|  * <p> |
|  * <b>Note:</b> This method sets the position in the <em>input</em> text, |
|  * while {@link #next} and {@link #previous} iterate through characters |
|  * in the normalized <em>output</em>. There is therefore not necessarily a |
|  * one-to-one correspondence between the characters returned by |
|  * {@code next} and {@code previous} and the indices passed to and |
|  * returned from {@code setIndex} and {@link #getIndex}. |
|  * |
|  * @param index the desired index in the input text |
|  * @return the first normalized character produced by iterating forward |
|  *         from the given index, or {@link #DONE} if there is none |
|  * @throws IllegalArgumentException if the given index is less than |
|  *         {@link #getBeginIndex} or greater than {@link #getEndIndex} |
|  *         (the check is delegated to the underlying text iterator) |
|  * @obsolete ICU 3.2 |
|  */ |
| public int setIndex(int index) { |
|     setIndexOnly(index); |
|     final int firstNormalized = current(); |
|     return firstNormalized; |
| } |
| |
| /** |
| * Retrieve the index of the start of the input text. This is the begin |
| * index of the {@code CharacterIterator} or the start (i.e. 0) of the |
| * {@code String} over which this {@code NormalizerBase} is iterating. |
| * This implementation always returns 0. |
| * @deprecated ICU 2.2. |
| * @return the start index of the input text, always 0 |
| */ |
| @Deprecated |
| public int getBeginIndex() { |
| return 0; |
| } |
| |
| /** |
|  * Retrieves the index of the end of the input text. This is the end index |
|  * of the {@code CharacterIterator} or the length of the {@code String} |
|  * over which this {@code NormalizerBase} is iterating. |
|  * |
|  * @deprecated ICU 2.2. Use {@link #endIndex} instead. |
|  * @return the end index of the input text, as reported by {@link #endIndex} |
|  * @see #endIndex |
|  */ |
| @Deprecated |
| public int getEndIndex() { |
|     final int end = endIndex(); |
|     return end; |
| } |
| |
| /** |
|  * Retrieves the current iteration position in the input text that is |
|  * being normalized. This method is useful in applications such as |
|  * searching, where you need to determine the position in the input text |
|  * that corresponds to a given normalized output character. |
|  * <p> |
|  * <b>Note:</b> This method reports a position in the <em>input</em>, while |
|  * {@link #next} and {@link #previous} iterate through characters in the |
|  * <em>output</em>. There is therefore not necessarily a one-to-one |
|  * correspondence between the characters returned by {@code next} and |
|  * {@code previous} and the indices passed to and returned from |
|  * {@code setIndex} and {@code getIndex}. |
|  * |
|  * @return the start of the current input segment while buffered output |
|  *         remains unread, otherwise the start of the next segment |
|  * @stable ICU 2.8 |
|  */ |
| public int getIndex() { |
|     final boolean hasUnreadOutput = bufferPos < buffer.length(); |
|     return hasUnreadOutput ? currentIndex : nextIndex; |
| } |
| |
| /** |
|  * Retrieves the index of the end of the input text. This is the end index |
|  * of the {@code CharacterIterator} or the length of the {@code String} |
|  * over which this {@code NormalizerBase} is iterating. |
|  * |
|  * @return the length reported by the underlying input text iterator |
|  * @stable ICU 2.8 |
|  */ |
| public int endIndex() { |
|     final int length = text.getLength(); |
|     return length; |
| } |
| |
| //------------------------------------------------------------------------- |
| // Iterator attributes |
| //------------------------------------------------------------------------- |
| /** |
|  * Sets the normalization mode for this object. The normalizer in use is |
|  * re-derived from the new mode and the option bits this object already holds. |
|  * <p> |
|  * <b>Note:</b> If the normalization mode is changed while iterating |
|  * over a string, calls to {@link #next} and {@link #previous} may |
|  * return previously buffered characters in the old normalization mode |
|  * until the iteration is able to re-sync at the next base character. |
|  * It is safest to call {@link #setText setText()} or {@link #reset} |
|  * after calling {@code setMode}. |
|  * |
|  * @param newMode the new mode for this {@code NormalizerBase}. |
|  * The supported modes are: |
|  * <ul> |
|  *  <li>{@link #NFC}  - Unicode canonical decomposition |
|  *                      followed by canonical composition. |
|  *  <li>{@link #NFKC} - Unicode compatibility decomposition |
|  *                      followed by canonical composition. |
|  *  <li>{@link #NFD}  - Unicode canonical decomposition. |
|  *  <li>{@link #NFKD} - Unicode compatibility decomposition. |
|  *  <li>{@link #NONE} - Do nothing but return characters |
|  *                      from the underlying input text. |
|  * </ul> |
|  * |
|  * @see #getMode |
|  * @stable ICU 2.8 |
|  */ |
| public void setMode(Mode newMode) { |
|     this.mode = newMode; |
|     this.norm2 = newMode.getNormalizer2(this.options); |
| } |
| |
| /** |
|  * Returns the normalization mode currently used by this |
|  * {@code NormalizerBase}. |
|  * |
|  * @return the mode given at construction or by the latest {@link #setMode} |
|  * @see #setMode |
|  * @stable ICU 2.8 |
|  */ |
| public Mode getMode() { |
|     return this.mode; |
| } |
| |
| /** |
|  * Sets the input text over which this {@code NormalizerBase} will iterate. |
|  * The iteration position is set to the beginning of the input text. |
|  * |
|  * @param newText the new string to be normalized |
|  * @throws IllegalStateException if no iterator could be created for |
|  *         {@code newText} |
|  * @stable ICU 2.8 |
|  */ |
| public void setText(String newText) { |
|     final UCharacterIterator replacement = UCharacterIterator.getInstance(newText); |
|     if (null == replacement) { |
|         throw new IllegalStateException("Could not create a new UCharacterIterator"); |
|     } |
|     this.text = replacement; |
|     reset(); |
| } |
| |
| /** |
|  * Sets the input text over which this {@code NormalizerBase} will iterate. |
|  * Both segment indices are set to 0 and buffered output is discarded. |
|  * |
|  * @param newText the new text to be normalized |
|  * @throws IllegalStateException if no iterator could be created for |
|  *         {@code newText} |
|  * @stable ICU 2.8 |
|  */ |
| public void setText(CharacterIterator newText) { |
|     final UCharacterIterator replacement = UCharacterIterator.getInstance(newText); |
|     if (null == replacement) { |
|         throw new IllegalStateException("Could not create a new UCharacterIterator"); |
|     } |
|     this.text = replacement; |
|     nextIndex = 0; |
|     currentIndex = 0; |
|     clearBuffer(); |
| } |
| |
| /** Empties the normalization buffer and moves the read position to its start. */ |
| private void clearBuffer() { |
|     bufferPos = 0; |
|     buffer.setLength(0); |
| } |
| |
| private boolean nextNormalize() { |
| clearBuffer(); |
| currentIndex=nextIndex; |
| text.setIndex(nextIndex); |
| // Skip at least one character so we make progress. |
| int c=text.nextCodePoint(); |
| if(c<0) { |
| return false; |
| } |
| StringBuilder segment=new StringBuilder().appendCodePoint(c); |
| while((c=text.nextCodePoint())>=0) { |
| if(norm2.hasBoundaryBefore(c)) { |
| text.moveCodePointIndex(-1); |
| break; |
| } |
| segment.appendCodePoint(c); |
| } |
| nextIndex=text.getIndex(); |
| norm2.normalize(segment, buffer); |
| return buffer.length()!=0; |
| } |
| |
| private boolean previousNormalize() { |
| clearBuffer(); |
| nextIndex=currentIndex; |
| text.setIndex(currentIndex); |
| StringBuilder segment=new StringBuilder(); |
| int c; |
| while((c=text.previousCodePoint())>=0) { |
| if(c<=0xffff) { |
| segment.insert(0, (char)c); |
| } else { |
| segment.insert(0, Character.toChars(c)); |
| } |
| if(norm2.hasBoundaryBefore(c)) { |
| break; |
| } |
| } |
| currentIndex=text.getIndex(); |
| norm2.normalize(segment, buffer); |
| bufferPos=buffer.length(); |
| return buffer.length()!=0; |
| } |
| |
| } |