blob: 4bad166b55ef92a39c2bbee7bd148e43fbad8daa [file] [log] [blame]
/*
**********************************************************************
* Copyright (c) 2006-2007, Google and others. All Rights Reserved.
**********************************************************************
* Author: Mark Davis
**********************************************************************
*/
package org.unicode.cldr.util;
import java.util.ArrayList;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
/**
* This is intended to be a reference implementation for StringSearch. The
* implementation is thus not high performance; it just transforms both the key
* (what is searched for) and the target into collationElement space, and
* searches there. It is however, architecturally cleaner in that it first has a
* search that finds and extended range, then picks a boundary within that.
* <p>
* The key is that there is a match for a key string in a target text at positions <a,b> iff collator.compare(key,
* target.substring(a,b)) == 0.
*
* @author markdavis
*/
public class ReferenceStringSearch {
private static final int PADDING = 3;
private RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
private BreakIterator breaker;
private String key;
private String target;
// all of the following are in processed CollationElementSpace
private int[] keyBuffer;
private CollationElementIterator2 targetIterator;
private int[] targetBuffer;
private int targetBufferStart;
private int targetBufferLength;
// Map from target items back to native indices
private int[] targetBackMapBefore;
private int[] targetBackMapAfter;
private int lastAfter;
// input flags to indicate which way we want to go within the range of
// possible boundaries
boolean widestStart, widestLimit;
/**
* A struct for showing match information with the range of possible
* boundaries.
*/
public static class ExtendedRange {
int minStart, maxStart, minLimit, maxLimit;
public String toString() {
return minStart + ", " + maxStart + ", " + minLimit + ", " + maxLimit;
}
public String toString(String key, String target) {
return "'" + target.substring(0, minStart) + "["
+ target.substring(minStart, maxStart) + "{"
+ target.substring(maxStart, minLimit) + "}"
+ target.substring(minLimit, maxLimit) + "]"
+ target.substring(maxLimit) + "'";
}
}
/**
* A struct that just shows the start/end, based on the settings given.
*/
public static class Range {
public int start;
public int limit;
public String toString() {
return start + ", " + limit;
}
public String toString(String key, String target) {
return "'" + target.substring(0, start) + "["
+ target.substring(start, limit) + "]"
+ target.substring(limit) + "'";
}
}
public RuleBasedCollator getCollator() {
return collator;
}
/**
* If the collator settings are changed externally, be sure to call
* setCollator();
*/
public ReferenceStringSearch setCollator(RuleBasedCollator collator) {
this.collator = collator;
targetIterator = new CollationElementIterator2(collator);
// reset the key and target, since their collationElements may change
if (key != null) {
setKey(key);
}
if (target != null) {
setTarget(target);
}
return this;
}
public BreakIterator getBreaker() {
return breaker;
}
public ReferenceStringSearch setBreaker(BreakIterator breaker) {
this.breaker = breaker;
if (target != null) {
breaker.setText(target);
}
return this;
}
/**
* If true, will pick the largest possible limit for a match.
*/
public boolean isWidestLimit() {
return widestLimit;
}
/**
* If true, will pick the largest possible limit for a match.
*/
public void setWidestLimit(boolean widestLimit) {
this.widestLimit = widestLimit;
}
/**
* If true, will pick the least possible start for a match.
*/
public boolean isWidestStart() {
return widestStart;
}
/**
* If true, will pick the least possible start for a match.
*/
public void setWidestStart(boolean widestStart) {
this.widestStart = widestStart;
}
public String getKey() {
return key;
}
public ReferenceStringSearch setKey(String key) {
this.key = key;
ArrayList<Integer> keyBufferList = new ArrayList<Integer>();
CollationElementIterator2 keyIterator = new CollationElementIterator2(collator).setText(key);
while (true) {
int collationElement = keyIterator.nextProcessed();
if (collationElement == CollationElementIterator.NULLORDER) {
break;
}
// store the primary, plus the index before and after.
keyBufferList.add(collationElement);
}
keyBuffer = getIntBuffer(keyBufferList);
return this;
}
public int getNativeOffset() {
return targetBackMapBefore[targetBufferStart];
}
public ReferenceStringSearch setNativeOffset(int nativeOffset) {
if (nativeOffset < 0 || nativeOffset > target.length()) {
throw new ArrayIndexOutOfBoundsException();
}
// use dumb implementation. Better implementation would leverage
// contents of current buffer
targetIterator.setOffset(nativeOffset);
targetBufferStart = 0;
targetBufferLength = 0;
lastAfter = 0;
if (nativeOffset != 0) {
// we have to reset lastAfter!
targetIterator.previousProcessed();
lastAfter = targetIterator.offsetAfter;
}
fillBuffer();
return this;
}
public String getTarget() {
return target;
}
/**
* Set the text to search within.
*/
public ReferenceStringSearch setTarget(String target) {
this.target = target;
if (breaker != null) {
breaker.setText(target);
}
targetIterator.setText(target); // TODO optimize creation
// make the buffer just a bit bigger than we need
// note: we could optimize to reuse buffers, but probably not worth it
targetBuffer = new int[keyBuffer.length + PADDING];
targetBackMapBefore = new int[keyBuffer.length + PADDING];
targetBackMapAfter = new int[keyBuffer.length + PADDING];
targetBufferStart = 0;
lastAfter = 0;
targetBufferLength = 0;
fillBuffer();
return this;
}
/**
* We keep the circular buffer as start + length instead of start + limit so
* we can distinguish the 0 case.
*
* @return
*/
private boolean shiftBuffer() {
lastAfter = targetBackMapAfter[targetBufferStart]; // remember last after
++targetBufferStart;
if (targetBufferStart >= targetBuffer.length) {
targetBufferStart = 0;
}
--targetBufferLength;
return fillBuffer();
}
private boolean fillBuffer() {
// TODO mark as done so we don't call extra at the end
while (targetBufferLength < keyBuffer.length + 1) {
int ce = targetIterator.nextProcessed();
if (ce == CollationElementIterator.NULLORDER) {
return false;
}
int targetBufferLimit = targetBufferStart + targetBufferLength;
if (targetBufferLimit >= targetBuffer.length) {
targetBufferLimit -= targetBuffer.length;
}
targetBuffer[targetBufferLimit] = ce;
targetBackMapBefore[targetBufferLimit] = targetIterator.offsetBefore;
targetBackMapAfter[targetBufferLimit] = targetIterator.offsetAfter;
++targetBufferLength;
}
return true;
}
/**
* Find the next match, and set the min/maxStart/Limit. Return false if not
* found. Here is an example, where [...] is the maximal match, and {..} the
* minimal.
*
* <pre>
* Raw Positions of 'eß' in ' ess eß ESS̀ '
* '[ {ess} ]eß ESS̀ '
* ' ess[ {eß} ]ESS̀ '
* ' ess eß[ {ESS}̀ ]'
* </pre>
*
* Matches may overlap. Thus
*
* <pre>
* Raw Positions of 'a!a' in 'b.a.a.a.b'
* 'b[.{a.a}.]a.b'
* 'b.a[.{a.a}.]b'
* </pre>
*/
public boolean searchForwards(ExtendedRange position) {
while (targetBufferLength >= keyBuffer.length) {
if (matchesAt()) {
// minStart is from the After of previous item.
position.minStart = lastAfter;
position.maxStart = targetBackMapBefore[targetBufferStart];
int last = targetBufferStart + keyBuffer.length - 1;
if (last >= targetBuffer.length) {
last -= targetBuffer.length;
}
position.minLimit = targetBackMapAfter[last];
// maxLimit is from the Before of the next item
// or, if we are at the very end of the text, the text length.
if (targetBufferLength == keyBuffer.length) {
position.maxLimit = target.length();
} else {
++last; // we reuse the position since we are already there.
if (last >= targetBuffer.length) {
last -= targetBuffer.length;
}
position.maxLimit = targetBackMapBefore[last];
}
shiftBuffer(); // move to next offset
return true;
}
shiftBuffer(); // move to next offset
}
return false;
}
/**
* Simple match at position.
*/
private boolean matchesAt() {
int j = targetBufferStart;
for (int i = 0; i < keyBuffer.length; ++i) {
if (keyBuffer[i] != targetBuffer[j]) {
return false;
}
++j;
if (j >= targetBuffer.length) {
j = 0;
}
}
return true;
}
private ExtendedRange internalPosition = new ExtendedRange();
/**
* This is the main public interface for searching. It filters out anything
* that doesn't match the breaker, and adjusts the boundaries to the max/min
* permitted. Examples:
*
* <pre>
* Positions of 'eß' in ' ess eß ESS̀ '
* ' [ess] eß ESS̀ '
* ' ess [eß] ESS̀ '
* ' ess eß [ESS̀] '
* Count: 3
*
* Positions of 'a!a' in 'b.a.a.a.b'
* 'b.[a.a].a.b'
* 'b.a.[a.a].b'
* Count: 2
* </pre>
*
* @param position
* @return
*/
public boolean searchForwards(Range position) {
while (true) {
boolean succeeds = searchForwards(internalPosition);
if (!succeeds) return false;
position.start = getBoundary(breaker, internalPosition.minStart, internalPosition.maxStart, !widestStart);
if (position.start == -1) continue; // failed to find the right boundary
position.limit = getBoundary(breaker, internalPosition.minLimit, internalPosition.maxLimit, widestLimit);
if (position.limit == -1) continue; // failed to find the right boundary
return true;
}
}
/****************** PRIVATES ***************/
/**
* This really ought to be just methods on CollationElementIterator.
*/
public static class CollationElementIterator2 {
private CollationElementIterator keyIterator;
private int strengthMask;
private int variableTop;
private int offsetBefore;
private int offsetAfter;
public int getOffsetBefore() {
return offsetBefore;
}
public int getOffsetAfter() {
return offsetAfter;
}
public CollationElementIterator2 reset() {
keyIterator.reset();
return this;
}
public CollationElementIterator2 setOffset(int offset) {
keyIterator.setOffset(offset);
return this;
}
public CollationElementIterator2 setText(String source) {
keyIterator.setText(source);
return this;
}
public CollationElementIterator2(RuleBasedCollator collator) {
// gather some information that we will need later
strengthMask = 0xFFFF0000;
variableTop = !collator.isAlternateHandlingShifted() ? -1 : collator.getVariableTop() | 0xFFFF;
// this needs to be fixed a bit for case-level, etc.
switch (collator.getStrength()) {
case Collator.PRIMARY:
strengthMask = 0xFFFF0000;
break;
case Collator.SECONDARY:
strengthMask = 0xFFFFFF00;
break;
default:
strengthMask = 0xFFFFFFFF;
break;
}
keyIterator = collator.getCollationElementIterator("");
}
/**
* This should be a method on CollationElementIterator. Returns next
* non-zero collation element, setting indexBefore, indexAfter. Should also
* process shifted and strength, masking as needed. If a collation element
* has a continuation, then the indexAfter = indexBefore, for example, if
* [CE1,CE2] form a single collation element for the characters between
* native indexes 5 and 8, (C2 being a continuation, then the result of two
* calls to nextProcessed would be [CE1, 5, 5] then [CE1, 5,8].
* <p>
* previousProcessed would do similar things, backwards.
*
*/
int nextProcessed() {
while (true) {
offsetBefore = keyIterator.getOffset();
int collationElement = keyIterator.next();
if (collationElement != CollationElementIterator.NULLORDER) {
// note: the collation element iterator ought to give us processed values, but it doesn't
// so we have to simulate that.
collationElement &= strengthMask; // mask to only the strengths we have
// check for shifted.
// TODO This is not exactly right, and we also need to eject any following combining marks,
// so fix later.
if (collationElement < variableTop && collationElement > 0xFFFF) {
continue;
}
if (collationElement == 0) {
continue;
}
}
offsetAfter = keyIterator.getOffset();
return collationElement;
}
}
int previousProcessed() {
while (true) {
offsetAfter = keyIterator.getOffset();
int collationElement = keyIterator.previous();
if (collationElement != CollationElementIterator.NULLORDER) {
// note: the collation element iterator ought to give us processed values, but it doesn't
// so we have to simulate that.
collationElement &= strengthMask; // mask to only the strengths we have
// check for shifted.
// TODO This is not exactly right, and we also need to eject any following combining marks,
// so fix later.
if (collationElement < variableTop && collationElement > 0xFFFF) {
continue;
}
if (collationElement == 0) {
continue;
}
}
offsetBefore = keyIterator.getOffset();
return collationElement;
}
}
}
/**
* Utility
*
* @param keyBufferList
* @return
*/
private int[] getIntBuffer(ArrayList<Integer> keyBufferList) {
int[] buffer = new int[keyBufferList.size()];
for (int i = 0; i < keyBufferList.size(); ++i) {
buffer[i] = keyBufferList.get(i).intValue();
}
return buffer;
}
/**
* This really should be on breakIterator, so it can be done more efficiently.
* <p>
* Get the item that is on a boundary according to the breaker, and is between the input boundaries. The boolean
* greatest controls whether we pick the least or the greatest possible offset that works according to the breaker.
* <p>
* Returns -1 if there is no valid offset according to the breaker.
*
* @param minBoundary
* @param maxBoundary
* @param greatest
* @return
*/
public static int getBoundary(BreakIterator breaker, int minBoundary, int maxBoundary, boolean greatest) {
if (breaker == null) return greatest ? maxBoundary : minBoundary;
int result;
// this may or may not be the most efficient way to test; ask Andy
if (greatest) {
result = breaker.preceding(maxBoundary + 1);
if (result < minBoundary) {
result = -1;
}
} else {
result = breaker.following(minBoundary - 1);
if (result < minBoundary) {
result = -1;
}
}
return result;
}
}