tools/java/org/unicode/cldr/util/StateDictionaryBuilder.java - platform/external/cldr - Git at Google

 /*
  **********************************************************************
  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
  **********************************************************************
  * Author: Mark Davis
  **********************************************************************
  */
 package org.unicode.cldr.util;

 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.TreeMap;

 import org.unicode.cldr.util.Dictionary.DictionaryBuilder;
 import org.unicode.cldr.util.IntMap.BasicIntMapFactory;
 import org.unicode.cldr.util.IntMap.IntMapFactory;
 import org.unicode.cldr.util.StateDictionary.Cell;
 import org.unicode.cldr.util.StateDictionary.Row;
 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;

 /**
  * A simple state-table based dictionary builder.
  *
  * @author markdavis
  * @param <T>
  *            the return type for the dictionary
  */
 public class StateDictionaryBuilder<T> implements DictionaryBuilder<T> {

     private static final boolean SHOW_SIZE = true;

     // only used while building
     private Row buildingCurrentAddRow;
     private int builtTotalBytes;
     private int builtTotalStrings;

     // results of build
     private ArrayList<Row> builtRows;
     private Row builtBaseRow;
     private IntMap<T> builtResults;
     private int builtMaxByteLength;

     private StringByteConverter byteConverter = new CompactStringByteConverter(true); // new StringUtf8Converter(); //
     // new ByteString(false); // new
     // ByteString(true); //

     private IntMapFactory<T> intMapFactory = new BasicIntMapFactory<T>();

     /**
      * Get/set the IntMapFactory used to store the values for T. The default is BasicIntMapFactory.
      *
      * @return
      */
     public IntMapFactory<T> getIntMapFactory() {
         return intMapFactory;
     }

     public StateDictionaryBuilder<T> setIntMapFactory(IntMapFactory<T> intMapFactory) {
         this.intMapFactory = intMapFactory;
         return this;
     }

     /**
      * Get/Set the StringByteConverter used to convert strings to bytes and back. The default is a compacted form:
      * ByteString(true).
      *
      * @return
      */
     public StringByteConverter getByteConverter() {
         return byteConverter;
     }

     public StateDictionaryBuilder<T> setByteConverter(StringByteConverter byteConverter) {
         this.byteConverter = byteConverter;
         return this;
     }

     /**
      * Create a new simple StateDictionary. This format is relatively fast to
      * produce, and has a fair amount of compaction. The Map must be sorted
      * according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "".
      *
      * @param source
      * @return
      */
     public StateDictionary<T> make(Map<CharSequence, T> source) {
         // clear out state
         buildingCurrentAddRow = null;
         builtTotalBytes = builtTotalStrings = builtMaxByteLength = 0;
         builtRows = new ArrayList<Row>();
         builtBaseRow = makeRow();
         builtResults = intMapFactory.make(source.values());
         if (SHOW_SIZE) System.out.println("***VALUE STORAGE: " + builtResults.approximateStorage());

         Map<T, Integer> valueToInt = builtResults.getValueMap();
         Map<byte[], Integer> sorted = new TreeMap<byte[], Integer>(SHORTER_BYTE_ARRAY_COMPARATOR);
         for (CharSequence text : source.keySet()) {
             sorted.put(byteConverter.toBytes(text), valueToInt.get(source.get(text)));
         }

         for (byte[] key : sorted.keySet()) {
             addMapping(key, sorted.get(key));
         }

         // now compact the rows
         // first find out which rows are equivalent (recursively)
         Map<Row, Row> replacements = new HashMap<Row, Row>();
         {
             Map<Row, Row> equivalents = new TreeMap<Row, Row>(StateDictionary.rowComparator);
             for (Row row : builtRows) {
                 Row cardinal = equivalents.get(row);
                 if (cardinal == null) {
                     equivalents.put(row, row);
                 } else {
                     replacements.put(row, cardinal);
                 }
             }
         }
         if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size() + "\t REPLACEMENTS: " + replacements.size());

         // now replace all references to rows by their equivalents
         for (Row row : builtRows) {
             for (Byte key : row.byteToCell.keySet()) {
                 Cell cell = row.byteToCell.get(key);
                 Row newRow = replacements.get(cell.nextRow);
                 if (newRow != null) {
                     cell.nextRow = newRow;
                 }
             }
         }
         // now compact the rows array
         ArrayList<Row> newRows = new ArrayList<Row>();
         for (Row row : builtRows) {
             if (!replacements.containsKey(row)) {
                 newRows.add(row);
             }
         }
         // and set the Uniqueness values
         setUniqueValues(builtBaseRow);
         builtRows = newRows;
         if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size());
         return new StateDictionary<T>(builtBaseRow, builtRows, builtResults, builtMaxByteLength, byteConverter);
     }

     private Row makeRow() {
         Row row = new Row(builtRows.size());
         builtRows.add(row);
         return row;
     }

     private void addMapping(byte[] key, int result) {
         buildingCurrentAddRow = builtBaseRow;

         int lastIndex = key.length - 1;
         for (int i = 0; i <= lastIndex; ++i) {
             result = add(key[i], result, i == lastIndex);
         }
         builtTotalBytes += key.length;
         builtTotalStrings += 1;
         if (builtMaxByteLength < key.length) {
             builtMaxByteLength = key.length;
         }
     }

     private int add(byte key, int result, boolean last) {
         Cell matchingCell = buildingCurrentAddRow.byteToCell.get(key);
         if (matchingCell != null) {
             if (matchingCell.nextRow == null && !last) {
                 matchingCell.nextRow = makeRow();
                 // we add a continuation, so decrement
                 --buildingCurrentAddRow.terminatingReturnCount;
             }
             buildingCurrentAddRow = matchingCell.nextRow;
             return result - matchingCell.deltaResult;
         }
         Cell cell = new Cell();
         buildingCurrentAddRow.byteToCell.put(key, cell);
         cell.deltaResult = result;
         if (last) {
             cell.returns = true;
             ++buildingCurrentAddRow.returnCount;
             ++buildingCurrentAddRow.terminatingReturnCount;
         } else {
             cell.nextRow = buildingCurrentAddRow = makeRow();
         }
         // when we create a new cell for the first time, we deposit the
         // result, so we can clear it now
         return 0;
     }

     /**
      * Follow all the path values, and determine whether all possible results from
      * a row (when following bytes) are the same. If so, set the flag on the row.
      * The return value
      *
      * @param savedMatchValue
      *            TODO
      * @return true if unique
      */
     // NOTE: The way the table is built, we are guaranteed that the current value
     // is "correct" (with no deltas needed) for at least one of the paths
     // resulting from the row, EXCEPT in the very first row. Thus except for the first row,
     // every row will have at least one cell with a zero deltaResult. That in turn means that
     // if there is a deltaResult on a cell, at least one chain resulting from taking that cell
     // will have that result.
     // So, if there are any deltaResults in a row, it is not unique.
     // Otherwise, if any of the cells have non-unique rows, it is not unique
     // Otherwise it is unique.
     // So this traverses the rows, setting the values for anything it hasn't seen.
     // TODO: could be optimized (maybe) by saving the rows seen so far. On the other hand,
     // this might not be worth the time in adding them to a save list.
     private boolean setUniqueValues(Row currentRow) {
         if (currentRow.hasUniqueValue == Uniqueness.UNIQUE) {
             return true;
         }
         // see if there is any reason to make us not Unique
         // According to the structure above:
         // It will be that we have a child that is not unique,
         // or that we have a nonzero delta value
         // We don't have to look at anything else.
         // We DO have to call setUniqueValues on each of our children, to make sure they are set.
         for (Cell cell : currentRow.byteToCell.values()) {
             if (cell.nextRow != null && !setUniqueValues(cell.nextRow)) {
                 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
             } else if (cell.deltaResult != 0) {
                 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
             }
         }
         if (currentRow.hasUniqueValue == Uniqueness.AMBIGUOUS) {
             return false;
         }
         currentRow.hasUniqueValue = Uniqueness.UNIQUE;
         return true;
     }

     static final Comparator<byte[]> SHORTER_BYTE_ARRAY_COMPARATOR = new Comparator<byte[]>() {

         public int compare(byte[] o1, byte[] o2) {
             int minLen = o1.length;
             if (minLen > o2.length) {
                 minLen = o2.length;
             }
             for (int i = 0; i < minLen; ++i) {
                 if (o1[i] != o2[i]) {
                     return o1[i] < o2[i] ? -1 : 1; // return lesser first
                 }
             }
             return o1.length < o2.length ? -1 : o1.length > o2.length ? 1 : 0;
         }

     };
 }
	/*
	**********************************************************************
	* Copyright (c) 2006-2007, Google and others. All Rights Reserved.
	**********************************************************************
	* Author: Mark Davis
	**********************************************************************
	*/
	package org.unicode.cldr.util;

	import java.util.ArrayList;
	import java.util.Comparator;
	import java.util.HashMap;
	import java.util.Map;
	import java.util.TreeMap;

	import org.unicode.cldr.util.Dictionary.DictionaryBuilder;
	import org.unicode.cldr.util.IntMap.BasicIntMapFactory;
	import org.unicode.cldr.util.IntMap.IntMapFactory;
	import org.unicode.cldr.util.StateDictionary.Cell;
	import org.unicode.cldr.util.StateDictionary.Row;
	import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;

	/**
	* A simple state-table based dictionary builder.
	*
	* @author markdavis
	* @param <T>
	* the return type for the dictionary
	*/
	public class StateDictionaryBuilder<T> implements DictionaryBuilder<T> {

	private static final boolean SHOW_SIZE = true;

	// only used while building
	private Row buildingCurrentAddRow;
	private int builtTotalBytes;
	private int builtTotalStrings;

	// results of build
	private ArrayList<Row> builtRows;
	private Row builtBaseRow;
	private IntMap<T> builtResults;
	private int builtMaxByteLength;

	private StringByteConverter byteConverter = new CompactStringByteConverter(true); // new StringUtf8Converter(); //
	// new ByteString(false); // new
	// ByteString(true); //

	private IntMapFactory<T> intMapFactory = new BasicIntMapFactory<T>();

	/**
	* Get/set the IntMapFactory used to store the values for T. The default is BasicIntMapFactory.
	*
	* @return
	*/
	public IntMapFactory<T> getIntMapFactory() {
	return intMapFactory;
	}

	public StateDictionaryBuilder<T> setIntMapFactory(IntMapFactory<T> intMapFactory) {
	this.intMapFactory = intMapFactory;
	return this;
	}

	/**
	* Get/Set the StringByteConverter used to convert strings to bytes and back. The default is a compacted form:
	* ByteString(true).
	*
	* @return
	*/
	public StringByteConverter getByteConverter() {
	return byteConverter;
	}

	public StateDictionaryBuilder<T> setByteConverter(StringByteConverter byteConverter) {
	this.byteConverter = byteConverter;
	return this;
	}

	/**
	* Create a new simple StateDictionary. This format is relatively fast to
	* produce, and has a fair amount of compaction. The Map must be sorted
	* according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "".
	*
	* @param source
	* @return
	*/
	public StateDictionary<T> make(Map<CharSequence, T> source) {
	// clear out state
	buildingCurrentAddRow = null;
	builtTotalBytes = builtTotalStrings = builtMaxByteLength = 0;
	builtRows = new ArrayList<Row>();
	builtBaseRow = makeRow();
	builtResults = intMapFactory.make(source.values());
	if (SHOW_SIZE) System.out.println("***VALUE STORAGE: " + builtResults.approximateStorage());

	Map<T, Integer> valueToInt = builtResults.getValueMap();
	Map<byte[], Integer> sorted = new TreeMap<byte[], Integer>(SHORTER_BYTE_ARRAY_COMPARATOR);
	for (CharSequence text : source.keySet()) {
	sorted.put(byteConverter.toBytes(text), valueToInt.get(source.get(text)));
	}

	for (byte[] key : sorted.keySet()) {
	addMapping(key, sorted.get(key));
	}

	// now compact the rows
	// first find out which rows are equivalent (recursively)
	Map<Row, Row> replacements = new HashMap<Row, Row>();
	{
	Map<Row, Row> equivalents = new TreeMap<Row, Row>(StateDictionary.rowComparator);
	for (Row row : builtRows) {
	Row cardinal = equivalents.get(row);
	if (cardinal == null) {
	equivalents.put(row, row);
	} else {
	replacements.put(row, cardinal);
	}
	}
	}
	if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size() + "\t REPLACEMENTS: " + replacements.size());

	// now replace all references to rows by their equivalents
	for (Row row : builtRows) {
	for (Byte key : row.byteToCell.keySet()) {
	Cell cell = row.byteToCell.get(key);
	Row newRow = replacements.get(cell.nextRow);
	if (newRow != null) {
	cell.nextRow = newRow;
	}
	}
	}
	// now compact the rows array
	ArrayList<Row> newRows = new ArrayList<Row>();
	for (Row row : builtRows) {
	if (!replacements.containsKey(row)) {
	newRows.add(row);
	}
	}
	// and set the Uniqueness values
	setUniqueValues(builtBaseRow);
	builtRows = newRows;
	if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size());
	return new StateDictionary<T>(builtBaseRow, builtRows, builtResults, builtMaxByteLength, byteConverter);
	}

	private Row makeRow() {
	Row row = new Row(builtRows.size());
	builtRows.add(row);
	return row;
	}

	private void addMapping(byte[] key, int result) {
	buildingCurrentAddRow = builtBaseRow;

	int lastIndex = key.length - 1;
	for (int i = 0; i <= lastIndex; ++i) {
	result = add(key[i], result, i == lastIndex);
	}
	builtTotalBytes += key.length;
	builtTotalStrings += 1;
	if (builtMaxByteLength < key.length) {
	builtMaxByteLength = key.length;
	}
	}

	private int add(byte key, int result, boolean last) {
	Cell matchingCell = buildingCurrentAddRow.byteToCell.get(key);
	if (matchingCell != null) {
	if (matchingCell.nextRow == null && !last) {
	matchingCell.nextRow = makeRow();
	// we add a continuation, so decrement
	--buildingCurrentAddRow.terminatingReturnCount;
	}
	buildingCurrentAddRow = matchingCell.nextRow;
	return result - matchingCell.deltaResult;
	}
	Cell cell = new Cell();
	buildingCurrentAddRow.byteToCell.put(key, cell);
	cell.deltaResult = result;
	if (last) {
	cell.returns = true;
	++buildingCurrentAddRow.returnCount;
	++buildingCurrentAddRow.terminatingReturnCount;
	} else {
	cell.nextRow = buildingCurrentAddRow = makeRow();
	}
	// when we create a new cell for the first time, we deposit the
	// result, so we can clear it now
	return 0;
	}

	/**
	* Follow all the path values, and determine whether all possible results from
	* a row (when following bytes) are the same. If so, set the flag on the row.
	* The return value
	*
	* @param savedMatchValue
	* TODO
	* @return true if unique
	*/
	// NOTE: The way the table is built, we are guaranteed that the current value
	// is "correct" (with no deltas needed) for at least one of the paths
	// resulting from the row, EXCEPT in the very first row. Thus except for the first row,
	// every row will have at least one cell with a zero deltaResult. That in turn means that
	// if there is a deltaResult on a cell, at least one chain resulting from taking that cell
	// will have that result.
	// So, if there are any deltaResults in a row, it is not unique.
	// Otherwise, if any of the cells have non-unique rows, it is not unique
	// Otherwise it is unique.
	// So this traverses the rows, setting the values for anything it hasn't seen.
	// TODO: could be optimized (maybe) by saving the rows seen so far. On the other hand,
	// this might not be worth the time in adding them to a save list.
	private boolean setUniqueValues(Row currentRow) {
	if (currentRow.hasUniqueValue == Uniqueness.UNIQUE) {
	return true;
	}
	// see if there is any reason to make us not Unique
	// According to the structure above:
	// It will be that we have a child that is not unique,
	// or that we have a nonzero delta value
	// We don't have to look at anything else.
	// We DO have to call setUniqueValues on each of our children, to make sure they are set.
	for (Cell cell : currentRow.byteToCell.values()) {
	if (cell.nextRow != null && !setUniqueValues(cell.nextRow)) {
	currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
	} else if (cell.deltaResult != 0) {
	currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
	}
	}
	if (currentRow.hasUniqueValue == Uniqueness.AMBIGUOUS) {
	return false;
	}
	currentRow.hasUniqueValue = Uniqueness.UNIQUE;
	return true;
	}

	static final Comparator<byte[]> SHORTER_BYTE_ARRAY_COMPARATOR = new Comparator<byte[]>() {

	public int compare(byte[] o1, byte[] o2) {
	int minLen = o1.length;
	if (minLen > o2.length) {
	minLen = o2.length;
	}
	for (int i = 0; i < minLen; ++i) {
	if (o1[i] != o2[i]) {
	return o1[i] < o2[i] ? -1 : 1; // return lesser first
	}
	}
	return o1.length < o2.length ? -1 : o1.length > o2.length ? 1 : 0;
	}

	};
	}