blob: 60e35b7c620836623d4c16ed23ae482c3257f132 [file] [log] [blame]
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
/**
* Normalization filtered by a UnicodeSet.
* Normalizes portions of the text contained in the filter set and leaves
* portions not contained in the filter set unchanged.
* Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
* This class implements all of (and only) the Normalizer2 API.
* An instance of this class is unmodifiable/immutable.
* @stable ICU 4.4
* @author Markus W. Scherer
*/
class FilteredNormalizer2 extends Normalizer2 {
/**
* Constructs a filtered normalizer wrapping any Normalizer2 instance
* and a filter set.
* Both are aliased and must not be modified or deleted while this object
* is used.
* The filter set should be frozen; otherwise the performance will suffer greatly.
* @param n2 wrapped Normalizer2 instance
* @param filterSet UnicodeSet which determines the characters to be normalized
* @stable ICU 4.4
*/
public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
norm2=n2;
set=filterSet;
}
/**
* {@inheritDoc}
* @stable ICU 4.4
*/
@Override
public StringBuilder normalize(CharSequence src, StringBuilder dest) {
if(dest==src) {
throw new IllegalArgumentException();
}
dest.setLength(0);
normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
return dest;
}
/**
* {@inheritDoc}
* @stable ICU 4.6
*/
@Override
public Appendable normalize(CharSequence src, Appendable dest) {
if(dest==src) {
throw new IllegalArgumentException();
}
return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
}
/**
* {@inheritDoc}
* @stable ICU 4.4
*/
@Override
public StringBuilder normalizeSecondAndAppend(
StringBuilder first, CharSequence second) {
return normalizeSecondAndAppend(first, second, true);
}
/**
* {@inheritDoc}
* @stable ICU 4.4
*/
@Override
public StringBuilder append(StringBuilder first, CharSequence second) {
return normalizeSecondAndAppend(first, second, false);
}
/**
* {@inheritDoc}
* @stable ICU 4.6
*/
@Override
public String getDecomposition(int c) {
return set.contains(c) ? norm2.getDecomposition(c) : null;
}
/**
* {@inheritDoc}
* @stable ICU 49
*/
@Override
public int getCombiningClass(int c) {
return set.contains(c) ? norm2.getCombiningClass(c) : 0;
}
/**
* {@inheritDoc}
* @stable ICU 4.4
*/
@Override
public boolean isNormalized(CharSequence s) {
UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
int spanLimit=set.span(s, prevSpanLimit, spanCondition);
if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
spanCondition=UnicodeSet.SpanCondition.SIMPLE;
} else {
if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
return false;
}
spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
}
prevSpanLimit=spanLimit;
}
return true;
}
/**
* {@inheritDoc}
* @stable ICU 4.4
*/
@Override
public int spanQuickCheckYes(CharSequence s) {
UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
int spanLimit=set.span(s, prevSpanLimit, spanCondition);
if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
spanCondition=UnicodeSet.SpanCondition.SIMPLE;
} else {
int yesLimit=
prevSpanLimit+
norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
if(yesLimit<spanLimit) {
return yesLimit;
}
spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
}
prevSpanLimit=spanLimit;
}
return s.length();
}
/**
* {@inheritDoc}
* @stable ICU 4.4
*/
@Override
public boolean hasBoundaryBefore(int c) {
return !set.contains(c) || norm2.hasBoundaryBefore(c);
}
// Internal: No argument checking, and appends to dest.
// Pass as input spanCondition the one that is likely to yield a non-zero
// span length at the start of src.
// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
// UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
// and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
// an in-filter prefix.
private Appendable normalize(CharSequence src, Appendable dest,
UnicodeSet.SpanCondition spanCondition) {
// Don't throw away destination buffer between iterations.
StringBuilder tempDest=new StringBuilder();
try {
for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
int spanLimit=set.span(src, prevSpanLimit, spanCondition);
int spanLength=spanLimit-prevSpanLimit;
if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
if(spanLength!=0) {
dest.append(src, prevSpanLimit, spanLimit);
}
spanCondition=UnicodeSet.SpanCondition.SIMPLE;
} else {
if(spanLength!=0) {
// Not norm2.normalizeSecondAndAppend() because we do not want
// to modify the non-filter part of dest.
dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
}
spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
}
prevSpanLimit=spanLimit;
}
} catch(IOException e) {
throw new InternalError(e.toString(), e);
}
return dest;
}
private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
boolean doNormalize) {
if(first==second) {
throw new IllegalArgumentException();
}
if(first.length()==0) {
if(doNormalize) {
return normalize(second, first);
} else {
return first.append(second);
}
}
// merge the in-filter suffix of the first string with the in-filter prefix of the second
int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
if(prefixLimit!=0) {
CharSequence prefix=second.subSequence(0, prefixLimit);
int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
if(suffixStart==0) {
if(doNormalize) {
norm2.normalizeSecondAndAppend(first, prefix);
} else {
norm2.append(first, prefix);
}
} else {
StringBuilder middle=new StringBuilder(
first.subSequence(suffixStart, first.length()));
if(doNormalize) {
norm2.normalizeSecondAndAppend(middle, prefix);
} else {
norm2.append(middle, prefix);
}
first.delete(suffixStart, 0x7fffffff).append(middle);
}
}
if(prefixLimit<second.length()) {
CharSequence rest=second.subSequence(prefixLimit, second.length());
if(doNormalize) {
normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
} else {
first.append(rest);
}
}
return first;
}
private Normalizer2 norm2;
private UnicodeSet set;
};