tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java - platform/packages/inputmethods/LatinIME - Git at Google

 /*
  * Copyright (C) 2013 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package com.android.inputmethod.latin.makedict;

 import com.android.inputmethod.annotations.UsedForTesting;

 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;

 /**
  * Decodes binary files for a FusionDictionary.
  *
  * All the methods in this class are static.
  *
  * TODO: Move this file to makedict/internal.
  * TODO: Rename this class to DictDecoderUtils.
  */
 public final class BinaryDictDecoderUtils {
     /**
      * Prevents instantiation; this class only declares static members and nested types.
      */
     private BinaryDictDecoderUtils() {
         // This utility class is not publicly instantiable.
     }

     /**
      * Minimal cursor-style view over the bytes of a binary dictionary.
      *
      * The descriptions below reflect how {@link ByteBufferDictBuffer} implements each method.
      */
     @UsedForTesting
     public interface DictBuffer {
         /** Reads one byte at the current position and returns it as an unsigned value. */
         int readUnsignedByte();

         /** Reads two bytes at the current position and returns them as an unsigned value. */
         int readUnsignedShort();

         /** Reads three bytes at the current position and returns them as an unsigned value. */
         int readUnsignedInt24();

         /** Reads a signed 32-bit integer at the current position. */
         int readInt();

         /** Returns the current position of this buffer. */
         int position();

         /** Moves this buffer to {@code newPosition}. */
         void position(int newPosition);

         /** Writes {@code b} at the current position. */
         @UsedForTesting
         void put(byte b);

         /** Returns the limit of this buffer. */
         int limit();

         /** Returns the capacity of this buffer. */
         @UsedForTesting
         int capacity();
     }

     /**
      * A {@link DictBuffer} that forwards every operation to a wrapped {@link ByteBuffer}.
      */
     public static final class ByteBufferDictBuffer implements DictBuffer {
         // Assigned once in the constructor and never replaced afterwards.
         private final ByteBuffer mBuffer;

         /**
          * @param buffer the buffer all reads, writes and position changes are forwarded to.
          */
         public ByteBufferDictBuffer(final ByteBuffer buffer) {
             mBuffer = buffer;
         }

         /** Reads the next byte and masks it so that the result is in [0, 0xFF]. */
         @Override
         public int readUnsignedByte() {
             return mBuffer.get() & 0xFF;
         }

         /** Reads the next short and masks it so that the result is in [0, 0xFFFF]. */
         @Override
         public int readUnsignedShort() {
             return mBuffer.getShort() & 0xFFFF;
         }

         /**
          * Reads three bytes: one unsigned byte used as bits 16-23, followed by an unsigned
          * short used as bits 0-15.
          */
         @Override
         public int readUnsignedInt24() {
             final int retval = readUnsignedByte();
             return (retval << 16) + readUnsignedShort();
         }

         /** Reads the next int from the wrapped buffer. */
         @Override
         public int readInt() {
             return mBuffer.getInt();
         }

         /** Returns the position of the wrapped buffer. */
         @Override
         public int position() {
             return mBuffer.position();
         }

         /** Sets the position of the wrapped buffer to {@code newPos}. */
         @Override
         public void position(int newPos) {
             mBuffer.position(newPos);
         }

         /** Writes {@code b} into the wrapped buffer at its current position. */
         @Override
         public void put(final byte b) {
             mBuffer.put(b);
         }

         /** Returns the limit of the wrapped buffer. */
         @Override
         public int limit() {
             return mBuffer.limit();
         }

         /** Returns the capacity of the wrapped buffer. */
         @Override
         public int capacity() {
             return mBuffer.capacity();
         }
     }

     /**
      * Static helpers for the character encoding used inside binary dictionaries.
      *
      * An encoded character takes either one or three bytes. Looking at the first byte:
      * - 000xxxxx starts a three-byte character: xxxxx << 16 + next byte << 8 + next byte;
      * - except 00011111 (= 0x1F), which is the terminator. Unicode code points range from 0
      *   to 0x10FFFF, so a three-byte value starting with 00011111 would be outside unicode;
      * - anything else is an iso-latin-1 code stored on that single byte.
      * The whole unicode range can be encoded this way, including characters outside of the
      * BMP. Everything in iso-latin-1 takes one byte, except control characters, which should
      * never happen anyway (and still work, but take three bytes).
      */
     static final class CharEncoding {
         private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
         private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;

         /**
          * Tells whether a character code lies in the range that is stored on a single byte.
          */
         private static boolean fitsOnOneByte(final int character) {
             return MINIMAL_ONE_BYTE_CHARACTER_VALUE <= character
                     && MAXIMAL_ONE_BYTE_CHARACTER_VALUE >= character;
         }

         /**
          * Returns the encoded size of a character.
          *
          * @param character the character code.
          * @return 1 for one-byte characters and for FormatSpec.INVALID_CHARACTER, 3 otherwise.
          */
         static int getCharSize(final int character) {
             // See char encoding in FusionDictionary.java
             final boolean singleByte =
                     fitsOnOneByte(character) || FormatSpec.INVALID_CHARACTER == character;
             return singleByte ? 1 : 3;
         }

         /**
          * Returns the sum of the encoded sizes of all characters in the array.
          */
         static int getCharArraySize(final int[] chars) {
             int totalSize = 0;
             for (int i = 0; i < chars.length; ++i) {
                 totalSize += getCharSize(chars[i]);
             }
             return totalSize;
         }

         /**
          * Encodes one code point into a byte array.
          *
          * @param buffer the byte array to write to.
          * @param position the index of the first byte to write.
          * @param codePoint the code point to encode.
          * @return the index right after the bytes written.
          */
         private static int putCodePoint(final byte[] buffer, final int position,
                 final int codePoint) {
             int next = position;
             if (1 == getCharSize(codePoint)) {
                 buffer[next++] = (byte)codePoint;
             } else {
                 // Three bytes, most significant first.
                 buffer[next++] = (byte)(0xFF & (codePoint >> 16));
                 buffer[next++] = (byte)(0xFF & (codePoint >> 8));
                 buffer[next++] = (byte)(0xFF & codePoint);
             }
             return next;
         }

         /**
          * Encodes an array of code points into a byte array. No terminator is written.
          *
          * @param codePoints the code point array to write.
          * @param buffer the byte buffer to write to.
          * @param index the index in buffer to write the character array to.
          * @return the index after the last character.
          */
         static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
             int writePos = index;
             for (int i = 0; i < codePoints.length; ++i) {
                 writePos = putCodePoint(buffer, writePos, codePoints[i]);
             }
             return writePos;
         }

         /**
          * Encodes a string into a byte array, followed by the terminator byte.
          *
          * @param buffer the byte buffer to write to.
          * @param origin the offset to write from.
          * @param word the string to write.
          * @return the size written, in bytes, terminator included.
          */
         static int writeString(final byte[] buffer, final int origin, final String word) {
             final int length = word.length();
             int writePos = origin;
             int i = 0;
             while (i < length) {
                 writePos = putCodePoint(buffer, writePos, word.codePointAt(i));
                 // Step over one full code point, which may span two chars.
                 i = word.offsetByCodePoints(i, 1);
             }
             buffer[writePos] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
             return writePos + 1 - origin;
         }

         /**
          * Encodes a string to an OutputStream, followed by the terminator byte.
          *
          * @param stream the OutputStream to write to.
          * @param word the string to write.
          * @return the size written, in bytes, terminator included.
          */
         static int writeString(final OutputStream stream, final String word) throws IOException {
             final int length = word.length();
             int written = 0;
             int i = 0;
             while (i < length) {
                 final int codePoint = word.codePointAt(i);
                 final int charSize = getCharSize(codePoint);
                 if (1 != charSize) {
                     // Three bytes, most significant first.
                     stream.write((byte) (0xFF & (codePoint >> 16)));
                     stream.write((byte) (0xFF & (codePoint >> 8)));
                     stream.write((byte) (0xFF & codePoint));
                 } else {
                     stream.write((byte) codePoint);
                 }
                 written += charSize;
                 // Step over one full code point, which may span two chars.
                 i = word.offsetByCodePoints(i, 1);
             }
             stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
             return written + FormatSpec.PTNODE_TERMINATOR_SIZE;
         }

         /**
          * Decodes characters from a DictBuffer until the terminator is met. This is the
          * converse of the writeString methods.
          */
         static String readString(final DictBuffer dictBuffer) {
             final StringBuilder decoded = new StringBuilder();
             for (int c = readChar(dictBuffer); FormatSpec.INVALID_CHARACTER != c;
                     c = readChar(dictBuffer)) {
                 decoded.appendCodePoint(c);
             }
             return decoded.toString();
         }

         /**
          * Decodes one character from the buffer, following the format described on this class.
          *
          * @param dictBuffer the buffer, positioned over an encoded character.
          * @return the character code, or FormatSpec.INVALID_CHARACTER if the terminator was read.
          */
         static int readChar(final DictBuffer dictBuffer) {
             final int firstByte = dictBuffer.readUnsignedByte();
             if (fitsOnOneByte(firstByte)) {
                 return firstByte;
             }
             if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == firstByte) {
                 return FormatSpec.INVALID_CHARACTER;
             }
             // Three-byte character: the first byte holds the upper bits.
             return (firstByte << 16) + dictBuffer.readUnsignedShort();
         }
     }

     /**
      * Reads the PtNode count at the current position of the buffer, consuming one byte when
      * the first byte does not exceed FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT and two
      * bytes otherwise.
      *
      * @param dictBuffer the buffer, positioned over a PtNode count.
      * @return the PtNode count.
      */
     /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
         final int firstByte = dictBuffer.readUnsignedByte();
         if (firstByte <= FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT) {
             return firstByte;
         }
         // Two-byte count: the masked first byte gives the high bits, the next byte the low bits.
         final int highBits = (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & firstByte) << 8;
         return highBits + dictBuffer.readUnsignedByte();
     }

     /**
      * Looks up the word stored at a given position and returns it as a weighted string.
      *
      * The decoder position read on entry is set again before returning normally.
      *
      * @param dictDecoder the dict decoder.
      * @param headerSize the size of the header.
      * @param pos the position to seek.
      * @return the word with its frequency, as a weighted string.
      */
     @UsedForTesting
     /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
             final int headerSize, final int pos) {
         final int savedPosition = dictDecoder.getPosition();
         dictDecoder.setPosition(pos);
         final WeightedString word =
                 getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
         dictDecoder.setPosition(savedPosition);
         return word;
     }

     /**
      * Searches the PtNode arrays that start right after the header for the PtNode located at
      * {@code pos}, and rebuilds the word that leads to it.
      *
      * The search starts at the first PtNode array and descends into the children array of the
      * last visited PtNode whose children address is not greater than {@code pos}, appending
      * that PtNode's characters to the word on the way down.
      *
      * This method moves the decoder position and does not restore it.
      *
      * @param dictDecoder the dict decoder.
      * @param headerSize the size of the header, i.e. the position of the first PtNode array.
      * @param pos the address of the PtNode to look for.
      * @return the word with its probability info, or null if no PtNode at {@code pos} was found.
      */
     private static WeightedString getWordAtPositionWithoutParentAddress(
             final DictDecoder dictDecoder, final int headerSize, final int pos) {
         dictDecoder.setPosition(headerSize);
         final int count = dictDecoder.readPtNodeCount();
         int groupPos = dictDecoder.getPosition();
         final StringBuilder builder = new StringBuilder();
         WeightedString result = null;

         // Most recent PtNode of the current array whose children start at or before pos.
         PtNodeInfo last = null;
         // i counts the PtNodes left to read in the current array. It is reloaded whenever the
         // search descends into a children array; the loop's --i then applies to the new count.
         for (int i = count - 1; i >= 0; --i) {
             final PtNodeInfo info = dictDecoder.readPtNode(groupPos);
             groupPos = info.mEndAddress;
             if (info.mOriginalAddress == pos) {
                 builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
                 result = new WeightedString(builder.toString(), info.mProbabilityInfo);
                 break; // and return
             }
             if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
                 if (info.mChildrenAddress > pos) {
                     // This node's children start past pos, so pos can only be below the
                     // previously remembered node, if there is one.
                     if (null == last) continue;
                     builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
                     dictDecoder.setPosition(last.mChildrenAddress);
                     i = dictDecoder.readPtNodeCount();
                     groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
                     last = null;
                     continue;
                 }
                 last = info;
             }
             // End of the current array: descend into the remembered node if there is one.
             // Without the null check this threw a NullPointerException when no PtNode of the
             // array had usable children; the search now ends and null is returned instead.
             if (0 == i && null != last
                     && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
                 builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
                 dictDecoder.setPosition(last.mChildrenAddress);
                 i = dictDecoder.readPtNodeCount();
                 groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
                 last = null;
                 continue;
             }
         }
         return result;
     }

     /**
      * Convenience overload of isBinaryDictionary taking a file name instead of a File object.
      *
      * @param filename the name of the file to test.
      * @return the result of isBinaryDictionary for the File built from {@code filename}.
      */
     public static boolean isBinaryDictionary(final String filename) {
         return isBinaryDictionary(new File(filename));
     }

     /**
      * Basic test to find out whether the file is a binary dictionary or not.
      *
      * A decoder is requested for the whole file; the file is reported as a binary dictionary
      * only if a decoder is returned and it reports a valid raw binary dictionary.
      *
      * @param file The file to test.
      * @return true if it's a binary dictionary, false otherwise
      */
     public static boolean isBinaryDictionary(final File file) {
         final DictDecoder decoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
         return null != decoder && decoder.hasValidRawBinaryDictionary();
     }
 }
	/*
	* Copyright (C) 2013 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package com.android.inputmethod.latin.makedict;

	import com.android.inputmethod.annotations.UsedForTesting;

	import java.io.File;
	import java.io.IOException;
	import java.io.OutputStream;
	import java.nio.ByteBuffer;

	/**
	* Decodes binary files for a FusionDictionary.
	*
	* All the methods in this class are static.
	*
	* TODO: Move this file to makedict/internal.
	* TODO: Rename this class to DictDecoderUtils.
	*/
	public final class BinaryDictDecoderUtils {
	/**
	 * Prevents instantiation; this class only declares static members and nested types.
	 */
	private BinaryDictDecoderUtils() {
	// This utility class is not publicly instantiable.
	}

	/**
	 * Minimal cursor-style view over the bytes of a binary dictionary.
	 *
	 * The descriptions below reflect how {@link ByteBufferDictBuffer} implements each method.
	 */
	@UsedForTesting
	public interface DictBuffer {
	    /** Reads one byte at the current position and returns it as an unsigned value. */
	    int readUnsignedByte();

	    /** Reads two bytes at the current position and returns them as an unsigned value. */
	    int readUnsignedShort();

	    /** Reads three bytes at the current position and returns them as an unsigned value. */
	    int readUnsignedInt24();

	    /** Reads a signed 32-bit integer at the current position. */
	    int readInt();

	    /** Returns the current position of this buffer. */
	    int position();

	    /** Moves this buffer to {@code newPosition}. */
	    void position(int newPosition);

	    /** Writes {@code b} at the current position. */
	    @UsedForTesting
	    void put(byte b);

	    /** Returns the limit of this buffer. */
	    int limit();

	    /** Returns the capacity of this buffer. */
	    @UsedForTesting
	    int capacity();
	}

	/**
	 * A {@link DictBuffer} that forwards every operation to a wrapped {@link ByteBuffer}.
	 */
	public static final class ByteBufferDictBuffer implements DictBuffer {
	    // Assigned once in the constructor and never replaced afterwards.
	    private final ByteBuffer mBuffer;

	    /**
	     * @param buffer the buffer all reads, writes and position changes are forwarded to.
	     */
	    public ByteBufferDictBuffer(final ByteBuffer buffer) {
	        mBuffer = buffer;
	    }

	    /** Reads the next byte and masks it so that the result is in [0, 0xFF]. */
	    @Override
	    public int readUnsignedByte() {
	        return mBuffer.get() & 0xFF;
	    }

	    /** Reads the next short and masks it so that the result is in [0, 0xFFFF]. */
	    @Override
	    public int readUnsignedShort() {
	        return mBuffer.getShort() & 0xFFFF;
	    }

	    /**
	     * Reads three bytes: one unsigned byte used as bits 16-23, followed by an unsigned
	     * short used as bits 0-15.
	     */
	    @Override
	    public int readUnsignedInt24() {
	        final int retval = readUnsignedByte();
	        return (retval << 16) + readUnsignedShort();
	    }

	    /** Reads the next int from the wrapped buffer. */
	    @Override
	    public int readInt() {
	        return mBuffer.getInt();
	    }

	    /** Returns the position of the wrapped buffer. */
	    @Override
	    public int position() {
	        return mBuffer.position();
	    }

	    /** Sets the position of the wrapped buffer to {@code newPos}. */
	    @Override
	    public void position(int newPos) {
	        mBuffer.position(newPos);
	    }

	    /** Writes {@code b} into the wrapped buffer at its current position. */
	    @Override
	    public void put(final byte b) {
	        mBuffer.put(b);
	    }

	    /** Returns the limit of the wrapped buffer. */
	    @Override
	    public int limit() {
	        return mBuffer.limit();
	    }

	    /** Returns the capacity of the wrapped buffer. */
	    @Override
	    public int capacity() {
	        return mBuffer.capacity();
	    }
	}

	/**
	 * Static helpers for the character encoding used inside binary dictionaries.
	 *
	 * An encoded character takes either one or three bytes. Looking at the first byte:
	 * - 000xxxxx starts a three-byte character: xxxxx << 16 + next byte << 8 + next byte;
	 * - except 00011111 (= 0x1F), which is the terminator. Unicode code points range from 0
	 *   to 0x10FFFF, so a three-byte value starting with 00011111 would be outside unicode;
	 * - anything else is an iso-latin-1 code stored on that single byte.
	 * The whole unicode range can be encoded this way, including characters outside of the
	 * BMP. Everything in iso-latin-1 takes one byte, except control characters, which should
	 * never happen anyway (and still work, but take three bytes).
	 */
	static final class CharEncoding {
	    private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
	    private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;

	    /**
	     * Tells whether a character code lies in the range that is stored on a single byte.
	     */
	    private static boolean fitsOnOneByte(final int character) {
	        return MINIMAL_ONE_BYTE_CHARACTER_VALUE <= character
	                && MAXIMAL_ONE_BYTE_CHARACTER_VALUE >= character;
	    }

	    /**
	     * Returns the encoded size of a character.
	     *
	     * @param character the character code.
	     * @return 1 for one-byte characters and for FormatSpec.INVALID_CHARACTER, 3 otherwise.
	     */
	    static int getCharSize(final int character) {
	        // See char encoding in FusionDictionary.java
	        final boolean singleByte =
	                fitsOnOneByte(character) || FormatSpec.INVALID_CHARACTER == character;
	        return singleByte ? 1 : 3;
	    }

	    /**
	     * Returns the sum of the encoded sizes of all characters in the array.
	     */
	    static int getCharArraySize(final int[] chars) {
	        int totalSize = 0;
	        for (int i = 0; i < chars.length; ++i) {
	            totalSize += getCharSize(chars[i]);
	        }
	        return totalSize;
	    }

	    /**
	     * Encodes one code point into a byte array.
	     *
	     * @param buffer the byte array to write to.
	     * @param position the index of the first byte to write.
	     * @param codePoint the code point to encode.
	     * @return the index right after the bytes written.
	     */
	    private static int putCodePoint(final byte[] buffer, final int position,
	            final int codePoint) {
	        int next = position;
	        if (1 == getCharSize(codePoint)) {
	            buffer[next++] = (byte)codePoint;
	        } else {
	            // Three bytes, most significant first.
	            buffer[next++] = (byte)(0xFF & (codePoint >> 16));
	            buffer[next++] = (byte)(0xFF & (codePoint >> 8));
	            buffer[next++] = (byte)(0xFF & codePoint);
	        }
	        return next;
	    }

	    /**
	     * Encodes an array of code points into a byte array. No terminator is written.
	     *
	     * @param codePoints the code point array to write.
	     * @param buffer the byte buffer to write to.
	     * @param index the index in buffer to write the character array to.
	     * @return the index after the last character.
	     */
	    static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
	        int writePos = index;
	        for (int i = 0; i < codePoints.length; ++i) {
	            writePos = putCodePoint(buffer, writePos, codePoints[i]);
	        }
	        return writePos;
	    }

	    /**
	     * Encodes a string into a byte array, followed by the terminator byte.
	     *
	     * @param buffer the byte buffer to write to.
	     * @param origin the offset to write from.
	     * @param word the string to write.
	     * @return the size written, in bytes, terminator included.
	     */
	    static int writeString(final byte[] buffer, final int origin, final String word) {
	        final int length = word.length();
	        int writePos = origin;
	        int i = 0;
	        while (i < length) {
	            writePos = putCodePoint(buffer, writePos, word.codePointAt(i));
	            // Step over one full code point, which may span two chars.
	            i = word.offsetByCodePoints(i, 1);
	        }
	        buffer[writePos] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
	        return writePos + 1 - origin;
	    }

	    /**
	     * Encodes a string to an OutputStream, followed by the terminator byte.
	     *
	     * @param stream the OutputStream to write to.
	     * @param word the string to write.
	     * @return the size written, in bytes, terminator included.
	     */
	    static int writeString(final OutputStream stream, final String word) throws IOException {
	        final int length = word.length();
	        int written = 0;
	        int i = 0;
	        while (i < length) {
	            final int codePoint = word.codePointAt(i);
	            final int charSize = getCharSize(codePoint);
	            if (1 != charSize) {
	                // Three bytes, most significant first.
	                stream.write((byte) (0xFF & (codePoint >> 16)));
	                stream.write((byte) (0xFF & (codePoint >> 8)));
	                stream.write((byte) (0xFF & codePoint));
	            } else {
	                stream.write((byte) codePoint);
	            }
	            written += charSize;
	            // Step over one full code point, which may span two chars.
	            i = word.offsetByCodePoints(i, 1);
	        }
	        stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
	        return written + FormatSpec.PTNODE_TERMINATOR_SIZE;
	    }

	    /**
	     * Decodes characters from a DictBuffer until the terminator is met. This is the
	     * converse of the writeString methods.
	     */
	    static String readString(final DictBuffer dictBuffer) {
	        final StringBuilder decoded = new StringBuilder();
	        for (int c = readChar(dictBuffer); FormatSpec.INVALID_CHARACTER != c;
	                c = readChar(dictBuffer)) {
	            decoded.appendCodePoint(c);
	        }
	        return decoded.toString();
	    }

	    /**
	     * Decodes one character from the buffer, following the format described on this class.
	     *
	     * @param dictBuffer the buffer, positioned over an encoded character.
	     * @return the character code, or FormatSpec.INVALID_CHARACTER if the terminator was read.
	     */
	    static int readChar(final DictBuffer dictBuffer) {
	        final int firstByte = dictBuffer.readUnsignedByte();
	        if (fitsOnOneByte(firstByte)) {
	            return firstByte;
	        }
	        if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == firstByte) {
	            return FormatSpec.INVALID_CHARACTER;
	        }
	        // Three-byte character: the first byte holds the upper bits.
	        return (firstByte << 16) + dictBuffer.readUnsignedShort();
	    }
	}

	/**
	 * Reads the PtNode count at the current position of the buffer, consuming one byte when
	 * the first byte does not exceed FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT and two
	 * bytes otherwise.
	 *
	 * @param dictBuffer the buffer, positioned over a PtNode count.
	 * @return the PtNode count.
	 */
	/* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
	    final int firstByte = dictBuffer.readUnsignedByte();
	    if (firstByte <= FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT) {
	        return firstByte;
	    }
	    // Two-byte count: the masked first byte gives the high bits, the next byte the low bits.
	    final int highBits = (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & firstByte) << 8;
	    return highBits + dictBuffer.readUnsignedByte();
	}

	/**
	 * Looks up the word stored at a given position and returns it as a weighted string.
	 *
	 * The decoder position read on entry is set again before returning normally.
	 *
	 * @param dictDecoder the dict decoder.
	 * @param headerSize the size of the header.
	 * @param pos the position to seek.
	 * @return the word with its frequency, as a weighted string.
	 */
	@UsedForTesting
	/* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
	        final int headerSize, final int pos) {
	    final int savedPosition = dictDecoder.getPosition();
	    dictDecoder.setPosition(pos);
	    final WeightedString word =
	            getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
	    dictDecoder.setPosition(savedPosition);
	    return word;
	}

	/**
	 * Searches the PtNode arrays that start right after the header for the PtNode located at
	 * {@code pos}, and rebuilds the word that leads to it.
	 *
	 * The search starts at the first PtNode array and descends into the children array of the
	 * last visited PtNode whose children address is not greater than {@code pos}, appending
	 * that PtNode's characters to the word on the way down.
	 *
	 * This method moves the decoder position and does not restore it.
	 *
	 * @param dictDecoder the dict decoder.
	 * @param headerSize the size of the header, i.e. the position of the first PtNode array.
	 * @param pos the address of the PtNode to look for.
	 * @return the word with its probability info, or null if no PtNode at {@code pos} was found.
	 */
	private static WeightedString getWordAtPositionWithoutParentAddress(
	        final DictDecoder dictDecoder, final int headerSize, final int pos) {
	    dictDecoder.setPosition(headerSize);
	    final int count = dictDecoder.readPtNodeCount();
	    int groupPos = dictDecoder.getPosition();
	    final StringBuilder builder = new StringBuilder();
	    WeightedString result = null;

	    // Most recent PtNode of the current array whose children start at or before pos.
	    PtNodeInfo last = null;
	    // i counts the PtNodes left to read in the current array. It is reloaded whenever the
	    // search descends into a children array; the loop's --i then applies to the new count.
	    for (int i = count - 1; i >= 0; --i) {
	        final PtNodeInfo info = dictDecoder.readPtNode(groupPos);
	        groupPos = info.mEndAddress;
	        if (info.mOriginalAddress == pos) {
	            builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
	            result = new WeightedString(builder.toString(), info.mProbabilityInfo);
	            break; // and return
	        }
	        if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
	            if (info.mChildrenAddress > pos) {
	                // This node's children start past pos, so pos can only be below the
	                // previously remembered node, if there is one.
	                if (null == last) continue;
	                builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
	                dictDecoder.setPosition(last.mChildrenAddress);
	                i = dictDecoder.readPtNodeCount();
	                groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
	                last = null;
	                continue;
	            }
	            last = info;
	        }
	        // End of the current array: descend into the remembered node if there is one.
	        // Without the null check this threw a NullPointerException when no PtNode of the
	        // array had usable children; the search now ends and null is returned instead.
	        if (0 == i && null != last
	                && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
	            builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
	            dictDecoder.setPosition(last.mChildrenAddress);
	            i = dictDecoder.readPtNodeCount();
	            groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
	            last = null;
	            continue;
	        }
	    }
	    return result;
	}

	/**
	 * Convenience overload of isBinaryDictionary taking a file name instead of a File object.
	 *
	 * @param filename the name of the file to test.
	 * @return the result of isBinaryDictionary for the File built from {@code filename}.
	 */
	public static boolean isBinaryDictionary(final String filename) {
	    return isBinaryDictionary(new File(filename));
	}

	/**
	 * Basic test to find out whether the file is a binary dictionary or not.
	 *
	 * A decoder is requested for the whole file; the file is reported as a binary dictionary
	 * only if a decoder is returned and it reports a valid raw binary dictionary.
	 *
	 * @param file The file to test.
	 * @return true if it's a binary dictionary, false otherwise
	 */
	public static boolean isBinaryDictionary(final File file) {
	    final DictDecoder decoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
	    return null != decoder && decoder.hasValidRawBinaryDictionary();
	}
	}