| /* |
| * Copyright (C) 2013 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.android.inputmethod.latin.makedict; |
| |
| import com.android.inputmethod.annotations.UsedForTesting; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.nio.ByteBuffer; |
| |
| /** |
| * Decodes binary files for a FusionDictionary. |
| * |
| * All the methods in this class are static. |
| * |
| * TODO: Move this file to makedict/internal. |
| * TODO: Rename this class to DictDecoderUtils. |
| */ |
| public final class BinaryDictDecoderUtils { |
/**
 * Private on purpose: this class only hosts static helpers and nested types, so code
 * outside of it has no way to create an instance.
 */
private BinaryDictDecoderUtils() {
}
| |
| @UsedForTesting |
| public interface DictBuffer { |
| public int readUnsignedByte(); |
| public int readUnsignedShort(); |
| public int readUnsignedInt24(); |
| public int readInt(); |
| public int position(); |
| public void position(int newPosition); |
| @UsedForTesting |
| public void put(final byte b); |
| public int limit(); |
| @UsedForTesting |
| public int capacity(); |
| } |
| |
| public static final class ByteBufferDictBuffer implements DictBuffer { |
| private ByteBuffer mBuffer; |
| |
| public ByteBufferDictBuffer(final ByteBuffer buffer) { |
| mBuffer = buffer; |
| } |
| |
| @Override |
| public int readUnsignedByte() { |
| return mBuffer.get() & 0xFF; |
| } |
| |
| @Override |
| public int readUnsignedShort() { |
| return mBuffer.getShort() & 0xFFFF; |
| } |
| |
| @Override |
| public int readUnsignedInt24() { |
| final int retval = readUnsignedByte(); |
| return (retval << 16) + readUnsignedShort(); |
| } |
| |
| @Override |
| public int readInt() { |
| return mBuffer.getInt(); |
| } |
| |
| @Override |
| public int position() { |
| return mBuffer.position(); |
| } |
| |
| @Override |
| public void position(int newPos) { |
| mBuffer.position(newPos); |
| } |
| |
| @Override |
| public void put(final byte b) { |
| mBuffer.put(b); |
| } |
| |
| @Override |
| public int limit() { |
| return mBuffer.limit(); |
| } |
| |
| @Override |
| public int capacity() { |
| return mBuffer.capacity(); |
| } |
| } |
| |
| /** |
| * A class grouping utility function for our specific character encoding. |
| */ |
| static final class CharEncoding { |
| private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; |
| private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; |
| |
| /** |
| * Helper method to find out whether this code fits on one byte |
| */ |
| private static boolean fitsOnOneByte(final int character) { |
| return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE |
| && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; |
| } |
| |
| /** |
| * Compute the size of a character given its character code. |
| * |
| * Char format is: |
| * 1 byte = bbbbbbbb match |
| * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte |
| * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because |
| * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with |
| * 00011111 would be outside unicode. |
| * else: iso-latin-1 code |
| * This allows for the whole unicode range to be encoded, including chars outside of |
| * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control |
| * characters which should never happen anyway (and still work, but take 3 bytes). |
| * |
| * @param character the character code. |
| * @return the size in binary encoded-form, either 1 or 3 bytes. |
| */ |
| static int getCharSize(final int character) { |
| // See char encoding in FusionDictionary.java |
| if (fitsOnOneByte(character)) return 1; |
| if (FormatSpec.INVALID_CHARACTER == character) return 1; |
| return 3; |
| } |
| |
| /** |
| * Compute the byte size of a character array. |
| */ |
| static int getCharArraySize(final int[] chars) { |
| int size = 0; |
| for (int character : chars) size += getCharSize(character); |
| return size; |
| } |
| |
| /** |
| * Writes a char array to a byte buffer. |
| * |
| * @param codePoints the code point array to write. |
| * @param buffer the byte buffer to write to. |
| * @param index the index in buffer to write the character array to. |
| * @return the index after the last character. |
| */ |
| static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { |
| for (int codePoint : codePoints) { |
| if (1 == getCharSize(codePoint)) { |
| buffer[index++] = (byte)codePoint; |
| } else { |
| buffer[index++] = (byte)(0xFF & (codePoint >> 16)); |
| buffer[index++] = (byte)(0xFF & (codePoint >> 8)); |
| buffer[index++] = (byte)(0xFF & codePoint); |
| } |
| } |
| return index; |
| } |
| |
| /** |
| * Writes a string with our character format to a byte buffer. |
| * |
| * This will also write the terminator byte. |
| * |
| * @param buffer the byte buffer to write to. |
| * @param origin the offset to write from. |
| * @param word the string to write. |
| * @return the size written, in bytes. |
| */ |
| static int writeString(final byte[] buffer, final int origin, final String word) { |
| final int length = word.length(); |
| int index = origin; |
| for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { |
| final int codePoint = word.codePointAt(i); |
| if (1 == getCharSize(codePoint)) { |
| buffer[index++] = (byte)codePoint; |
| } else { |
| buffer[index++] = (byte)(0xFF & (codePoint >> 16)); |
| buffer[index++] = (byte)(0xFF & (codePoint >> 8)); |
| buffer[index++] = (byte)(0xFF & codePoint); |
| } |
| } |
| buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; |
| return index - origin; |
| } |
| |
| /** |
| * Writes a string with our character format to an OutputStream. |
| * |
| * This will also write the terminator byte. |
| * |
| * @param stream the OutputStream to write to. |
| * @param word the string to write. |
| * @return the size written, in bytes. |
| */ |
| static int writeString(final OutputStream stream, final String word) throws IOException { |
| final int length = word.length(); |
| int written = 0; |
| for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { |
| final int codePoint = word.codePointAt(i); |
| final int charSize = getCharSize(codePoint); |
| if (1 == charSize) { |
| stream.write((byte) codePoint); |
| } else { |
| stream.write((byte) (0xFF & (codePoint >> 16))); |
| stream.write((byte) (0xFF & (codePoint >> 8))); |
| stream.write((byte) (0xFF & codePoint)); |
| } |
| written += charSize; |
| } |
| stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR); |
| written += FormatSpec.PTNODE_TERMINATOR_SIZE; |
| return written; |
| } |
| |
| /** |
| * Reads a string from a DictBuffer. This is the converse of the above method. |
| */ |
| static String readString(final DictBuffer dictBuffer) { |
| final StringBuilder s = new StringBuilder(); |
| int character = readChar(dictBuffer); |
| while (character != FormatSpec.INVALID_CHARACTER) { |
| s.appendCodePoint(character); |
| character = readChar(dictBuffer); |
| } |
| return s.toString(); |
| } |
| |
| /** |
| * Reads a character from the buffer. |
| * |
| * This follows the character format documented earlier in this source file. |
| * |
| * @param dictBuffer the buffer, positioned over an encoded character. |
| * @return the character code. |
| */ |
| static int readChar(final DictBuffer dictBuffer) { |
| int character = dictBuffer.readUnsignedByte(); |
| if (!fitsOnOneByte(character)) { |
| if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { |
| return FormatSpec.INVALID_CHARACTER; |
| } |
| character <<= 16; |
| character += dictBuffer.readUnsignedShort(); |
| } |
| return character; |
| } |
| } |
| |
| /** |
| * Reads and returns the PtNode count out of a buffer and forwards the pointer. |
| */ |
| /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) { |
| final int msb = dictBuffer.readUnsignedByte(); |
| if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) { |
| return msb; |
| } else { |
| return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8) |
| + dictBuffer.readUnsignedByte(); |
| } |
| } |
| |
| /** |
| * Finds, as a string, the word at the position passed as an argument. |
| * |
| * @param dictDecoder the dict decoder. |
| * @param headerSize the size of the header. |
| * @param pos the position to seek. |
| * @return the word with its frequency, as a weighted string. |
| */ |
| @UsedForTesting |
| /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder, |
| final int headerSize, final int pos) { |
| final WeightedString result; |
| final int originalPos = dictDecoder.getPosition(); |
| dictDecoder.setPosition(pos); |
| result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos); |
| dictDecoder.setPosition(originalPos); |
| return result; |
| } |
| |
| private static WeightedString getWordAtPositionWithoutParentAddress( |
| final DictDecoder dictDecoder, final int headerSize, final int pos) { |
| dictDecoder.setPosition(headerSize); |
| final int count = dictDecoder.readPtNodeCount(); |
| int groupPos = dictDecoder.getPosition(); |
| final StringBuilder builder = new StringBuilder(); |
| WeightedString result = null; |
| |
| PtNodeInfo last = null; |
| for (int i = count - 1; i >= 0; --i) { |
| PtNodeInfo info = dictDecoder.readPtNode(groupPos); |
| groupPos = info.mEndAddress; |
| if (info.mOriginalAddress == pos) { |
| builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); |
| result = new WeightedString(builder.toString(), info.mProbabilityInfo); |
| break; // and return |
| } |
| if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) { |
| if (info.mChildrenAddress > pos) { |
| if (null == last) continue; |
| builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); |
| dictDecoder.setPosition(last.mChildrenAddress); |
| i = dictDecoder.readPtNodeCount(); |
| groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); |
| last = null; |
| continue; |
| } |
| last = info; |
| } |
| if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) { |
| builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); |
| dictDecoder.setPosition(last.mChildrenAddress); |
| i = dictDecoder.readPtNodeCount(); |
| groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i); |
| last = null; |
| continue; |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Helper method to pass a file name instead of a File object to isBinaryDictionary. |
| */ |
| public static boolean isBinaryDictionary(final String filename) { |
| final File file = new File(filename); |
| return isBinaryDictionary(file); |
| } |
| |
| /** |
| * Basic test to find out whether the file is a binary dictionary or not. |
| * |
| * @param file The file to test. |
| * @return true if it's a binary dictionary, false otherwise |
| */ |
| public static boolean isBinaryDictionary(final File file) { |
| final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length()); |
| if (dictDecoder == null) { |
| return false; |
| } |
| return dictDecoder.hasValidRawBinaryDictionary(); |
| } |
| } |