| /* |
| * Copyright (C) 2013 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.android.inputmethod.latin.makedict; |
| |
| import com.android.inputmethod.annotations.UsedForTesting; |
| import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; |
| import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; |
| import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; |
| import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; |
| import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; |
| |
| import java.io.File; |
| import java.io.FileNotFoundException; |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.Map.Entry; |
| |
| /** |
| * An implementation of DictEncoder for version 2 binary dictionary. |
| */ |
| @UsedForTesting |
| public class Ver2DictEncoder implements DictEncoder { |
| |
| private final File mDictFile; |
| private OutputStream mOutStream; |
| private byte[] mBuffer; |
| private int mPosition; |
| private final int mCodePointTableMode; |
| public static final int CODE_POINT_TABLE_OFF = 0; |
| public static final int CODE_POINT_TABLE_ON = 1; |
| |
| @UsedForTesting |
| public Ver2DictEncoder(final File dictFile, final int codePointTableMode) { |
| mDictFile = dictFile; |
| mOutStream = null; |
| mBuffer = null; |
| mCodePointTableMode = codePointTableMode; |
| } |
| |
| // This constructor is used only by BinaryDictOffdeviceUtilsTests. |
| // If you want to use this in the production code, you should consider keeping consistency of |
| // the interface of Ver3DictDecoder by using factory. |
| @UsedForTesting |
| public Ver2DictEncoder(final OutputStream outStream) { |
| mDictFile = null; |
| mOutStream = outStream; |
| mCodePointTableMode = CODE_POINT_TABLE_OFF; |
| } |
| |
| private void openStream() throws FileNotFoundException { |
| mOutStream = new FileOutputStream(mDictFile); |
| } |
| |
| private void close() throws IOException { |
| if (mOutStream != null) { |
| mOutStream.close(); |
| mOutStream = null; |
| } |
| } |
| |
| // Package for testing |
| static CodePointTable makeCodePointTable(final FusionDictionary dict) { |
| final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>(); |
| for (final WordProperty word : dict) { |
| // Store per code point occurrence |
| final String wordString = word.mWord; |
| for (int i = 0; i < wordString.length(); ++i) { |
| final int codePoint = Character.codePointAt(wordString, i); |
| if (codePointOccurrenceCounts.containsKey(codePoint)) { |
| codePointOccurrenceCounts.put(codePoint, |
| codePointOccurrenceCounts.get(codePoint) + 1); |
| } else { |
| codePointOccurrenceCounts.put(codePoint, 1); |
| } |
| } |
| } |
| final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray = |
| new ArrayList<>(codePointOccurrenceCounts.entrySet()); |
| // Descending order sort by occurrence (value side) |
| Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() { |
| @Override |
| public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) { |
| if (a.getValue() != b.getValue()) { |
| return b.getValue().compareTo(a.getValue()); |
| } |
| return b.getKey().compareTo(a.getKey()); |
| } |
| }); |
| int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; |
| // Temporary map for writing of nodes |
| final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>(); |
| for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { |
| // Put a relation from the original code point to the one byte code. |
| codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); |
| if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { |
| break; |
| } |
| } |
| // codePointToOneByteCodeMap for writing the trie |
| // codePointOccurrenceArray for writing the header |
| return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); |
| } |
| |
| @Override |
| public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) |
| throws IOException, UnsupportedFormatException { |
| // We no longer support anything but the latest version of v2. |
| if (formatOptions.mVersion != FormatSpec.VERSION202) { |
| throw new UnsupportedFormatException( |
| "The given format options has wrong version number : " |
| + formatOptions.mVersion); |
| } |
| |
| if (mOutStream == null) { |
| openStream(); |
| } |
| |
| // Make code point conversion table ordered by occurrence of code points |
| // Version 201 or later have codePointTable |
| final CodePointTable codePointTable; |
| if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion |
| < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) { |
| codePointTable = new CodePointTable(); |
| } else { |
| codePointTable = makeCodePointTable(dict); |
| } |
| |
| BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, |
| codePointTable.mCodePointOccurrenceArray); |
| |
| // Addresses are limited to 3 bytes, but since addresses can be relative to each node |
| // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding |
| // the order of the PtNode arrays becomes a quite complicated problem, because though the |
| // dictionary itself does not have a size limit, each node array must still be within 16MB |
| // of all its children and parents. As long as this is ensured, the dictionary file may |
| // grow to any size. |
| |
| // Leave the choice of the optimal node order to the flattenTree function. |
| MakedictLog.i("Flattening the tree..."); |
| ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray); |
| |
| MakedictLog.i("Computing addresses..."); |
| BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, |
| codePointTable.mCodePointToOneByteCodeMap); |
| MakedictLog.i("Checking PtNode array..."); |
| if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes); |
| |
| // Create a buffer that matches the final dictionary size. |
| final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); |
| final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; |
| mBuffer = new byte[bufferSize]; |
| |
| MakedictLog.i("Writing file..."); |
| |
| for (PtNodeArray nodeArray : flatNodes) { |
| BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, |
| codePointTable.mCodePointToOneByteCodeMap); |
| } |
| if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes); |
| mOutStream.write(mBuffer, 0, mPosition); |
| |
| MakedictLog.i("Done"); |
| close(); |
| } |
| |
| @Override |
| public void setPosition(final int position) { |
| if (mBuffer == null || position < 0 || position >= mBuffer.length) return; |
| mPosition = position; |
| } |
| |
| @Override |
| public int getPosition() { |
| return mPosition; |
| } |
| |
| @Override |
| public void writePtNodeCount(final int ptNodeCount) { |
| final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); |
| if (countSize != 1 && countSize != 2) { |
| throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); |
| } |
| final int encodedPtNodeCount = (countSize == 2) ? |
| (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; |
| mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount, |
| countSize); |
| } |
| |
| private void writePtNodeFlags(final PtNode ptNode, |
| final HashMap<Integer, Integer> codePointToOneByteCodeMap) { |
| final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, |
| codePointToOneByteCodeMap); |
| mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, |
| BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos), |
| FormatSpec.PTNODE_FLAGS_SIZE); |
| } |
| |
| private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars, |
| final HashMap<Integer, Integer> codePointToOneByteCodeMap) { |
| mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition, |
| codePointToOneByteCodeMap); |
| if (hasSeveralChars) { |
| mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR; |
| } |
| } |
| |
| private void writeFrequency(final int frequency) { |
| if (frequency >= 0) { |
| mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency, |
| FormatSpec.PTNODE_FREQUENCY_SIZE); |
| } |
| } |
| |
| private void writeChildrenPosition(final PtNode ptNode, |
| final HashMap<Integer, Integer> codePointToOneByteCodeMap) { |
| final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, |
| codePointToOneByteCodeMap); |
| mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, |
| childrenPos); |
| } |
| |
| /** |
| * Write a bigram attributes list to mBuffer. |
| * |
| * @param bigrams the bigram attributes list. |
| * @param dict the dictionary the node array is a part of (for relative offsets). |
| */ |
| private void writeBigrams(final ArrayList<WeightedString> bigrams, |
| final FusionDictionary dict) { |
| if (bigrams == null) return; |
| |
| final Iterator<WeightedString> bigramIterator = bigrams.iterator(); |
| while (bigramIterator.hasNext()) { |
| final WeightedString bigram = bigramIterator.next(); |
| final PtNode target = |
| FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord); |
| final int addressOfBigram = target.mCachedAddressAfterUpdate; |
| final int unigramFrequencyForThisWord = target.getProbability(); |
| final int offset = addressOfBigram |
| - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); |
| final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(), |
| offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord); |
| mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags, |
| FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); |
| mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition, |
| Math.abs(offset)); |
| } |
| } |
| |
| @Override |
| public void writePtNode(final PtNode ptNode, final FusionDictionary dict, |
| final HashMap<Integer, Integer> codePointToOneByteCodeMap) { |
| writePtNodeFlags(ptNode, codePointToOneByteCodeMap); |
| writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap); |
| writeFrequency(ptNode.getProbability()); |
| writeChildrenPosition(ptNode, codePointToOneByteCodeMap); |
| writeBigrams(ptNode.mBigrams, dict); |
| } |
| } |