Generate English Dictionary (version : 200 - may contain bigrams)
- created separate class for bigram
Change-Id: Iaee544c6e62dcd0a90730d20d5c955280e5cf0ae
diff --git a/tools/makedict/src/com/android/tools/dict/BigramDictionary.java b/tools/makedict/src/com/android/tools/dict/BigramDictionary.java
new file mode 100644
index 0000000..35115bf
--- /dev/null
+++ b/tools/makedict/src/com/android/tools/dict/BigramDictionary.java
@@ -0,0 +1,286 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.tools.dict;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+/**
+ * Helper for MakeBinaryDictionary
+ * Deals with all the bigram data
+ */
+public class BigramDictionary {
+
+ /*
+ * Must match the values in the client side which is located in dictionary.cpp & dictionary.h
+ * Changing these values will generate totally different structure which must be also reflected
+ * on the client side.
+ */
+ public static final int FLAG_BIGRAM_READ = 0x80;
+ public static final int FLAG_BIGRAM_CHILDEXIST = 0x40;
+ public static final int FLAG_BIGRAM_CONTINUED = 0x80;
+ public static final int FLAG_BIGRAM_FREQ = 0x7F;
+
+ public static final int FOR_REVERSE_LOOKUPALL = -99;
+
+ public ArrayList<String> mBigramToFill = new ArrayList<String>();
+ public ArrayList<Integer> mBigramToFillAddress = new ArrayList<Integer>();
+
+ public HashMap<String, Bigram> mBi;
+
+ public boolean mHasBigram;
+
+ public BigramDictionary(String bigramSrcFilename, boolean hasBigram) {
+ mHasBigram = hasBigram;
+ loadBigram(bigramSrcFilename);
+ }
+
+ private void loadBigram(String filename) {
+ mBi = new HashMap<String, Bigram>();
+ if (!mHasBigram) {
+ System.out.println("Number of bigrams = " + Bigram.sBigramNum);
+ return;
+ }
+ try {
+ SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+ parser.parse(new File(filename), new DefaultHandler() {
+ String w1 = null;
+ boolean inWord1 = false;
+ boolean inWord2 = false;
+ int freq = 0, counter = 0;
+ Bigram tempBigram = null;
+
+ @Override
+ public void startElement(String uri, String localName,
+ String qName, Attributes attributes) {
+ if (qName.equals("bi")) {
+ inWord1 = true;
+ w1 = attributes.getValue(0);
+ int count = Integer.parseInt(attributes.getValue(1));
+ tempBigram = new Bigram(count);
+ counter = 0;
+ } else if (qName.equals("w")) {
+ inWord2 = true;
+ String word2 = attributes.getValue(0);
+ int freq = Integer.parseInt(attributes.getValue(1));
+ tempBigram.setWord2(counter, word2, freq);
+ counter++;
+ Bigram.sBigramNum++;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName,
+ String qName) {
+ if (inWord2) {
+ inWord2 = false;
+ } else if (inWord1) {
+ inWord1 = false;
+ mBi.put(w1, tempBigram);
+ }
+ }
+ });
+ } catch (Exception ioe) {
+ System.err.println("Exception in parsing bigram\n" + ioe);
+ ioe.printStackTrace();
+ }
+ System.out.println("Number of bigrams = " + Bigram.sBigramNum);
+ }
+
+ byte[] writeBigrams(byte[] dict, Map<String, Integer> mDictionary) {
+ for (int i = 0; i < mBigramToFill.size(); i++) {
+ String w1 = mBigramToFill.get(i);
+ int address = mBigramToFillAddress.get(i);
+
+ Bigram temp = mBi.get(w1);
+ int word2Count = temp.count;
+ int j4;
+ for (int j = 0; j < word2Count; j++) {
+ if (!mDictionary.containsKey(temp.word2[j])) {
+ System.out.println("Not in dictionary: " + temp.word2[j]);
+ System.exit(0);
+ } else {
+ j4 = (j * 4);
+ int addressOfWord2 = mDictionary.get(temp.word2[j]);
+ dict[address + j4 + 0] = (byte) (((addressOfWord2 & 0x3F0000) >> 16)
+ | FLAG_BIGRAM_READ);
+ dict[address + j4 + 1] = (byte) ((addressOfWord2 & 0x00FF00) >> 8);
+ dict[address + j4 + 2] = (byte) ((addressOfWord2 & 0x0000FF));
+
+ if (j == (word2Count - 1)) {
+ dict[address + j4 + 3] = (byte) (temp.freq[j] & FLAG_BIGRAM_FREQ);
+ } else {
+ dict[address + j4 + 3] = (byte) ((temp.freq[j] & FLAG_BIGRAM_FREQ)
+ | FLAG_BIGRAM_CONTINUED);
+ }
+ }
+ }
+ }
+
+ return dict;
+ }
+
+ void reverseLookupAll(Map<String, Integer> mDictionary, byte[] dict) {
+ Set<String> st = mDictionary.keySet();
+ for (String s : st) {
+ searchForTerminalNode(mDictionary.get(s), FOR_REVERSE_LOOKUPALL, dict);
+ }
+ }
+
+ void searchForTerminalNode(int bigramAddress, int frequency, byte[] dict) {
+ StringBuilder sb = new StringBuilder(48);
+ int pos;
+ boolean found = false;
+ int followDownBranchAddress = 2;
+ char followingChar = ' ';
+ int depth = 0;
+ int totalLoopCount = 0;
+
+ while (!found) {
+ boolean followDownAddressSearchStop = false;
+ boolean firstAddress = true;
+ boolean haveToSearchAll = true;
+
+ if (depth > 0) {
+ sb.append(followingChar);
+ }
+ pos = followDownBranchAddress; // pos start at count
+ int count = dict[pos] & 0xFF;
+ pos++;
+ for (int i = 0; i < count; i++) {
+ totalLoopCount++;
+ // pos at data
+ pos++;
+ // pos now at flag
+ if (!MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // non-terminal
+ if (!followDownAddressSearchStop) {
+ int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
+ if (addr > bigramAddress) {
+ followDownAddressSearchStop = true;
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ } else if (!haveToSearchAll) {
+ break;
+ }
+ } else {
+ followDownBranchAddress = addr;
+ followingChar = (char) (0xFF & dict[pos-1]);
+ if(firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = false;
+ }
+ }
+ }
+ pos += 3;
+ } else if (MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // terminal
+ // found !!
+ if (bigramAddress == (pos-1)) {
+ sb.append((char) (0xFF & dict[pos-1]));
+ found = true;
+ break;
+ }
+
+ // address + freq (4 byte)
+ if (MakeBinaryDictionary.getSecondBitOfByte(pos, dict)) {
+ if (!followDownAddressSearchStop) {
+ int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
+ if (addr > bigramAddress) {
+ followDownAddressSearchStop = true;
+ if (firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ } else if (!haveToSearchAll) {
+ break;
+ }
+ } else {
+ followDownBranchAddress = addr;
+ followingChar = (char) (0xFF & dict[pos-1]);
+ if(firstAddress) {
+ firstAddress = false;
+ haveToSearchAll = true;
+ }
+ }
+ }
+ pos += 4;
+ } else { // freq only (2 byte)
+ pos += 2;
+ }
+ // skipping bigram
+ int bigramExist = (dict[pos] & FLAG_BIGRAM_READ);
+ if (bigramExist > 0) {
+ int nextBigramExist = 1;
+ while (nextBigramExist > 0) {
+ pos += 3;
+ nextBigramExist = (dict[pos++] & FLAG_BIGRAM_CONTINUED);
+ }
+ } else {
+ pos++;
+ }
+ }
+ }
+ depth++;
+ if (followDownBranchAddress == 2) {
+ System.out.println("ERROR!!! Cannot find bigram!!");
+ System.exit(0);
+ }
+ }
+
+ if (frequency == FOR_REVERSE_LOOKUPALL) {
+ System.out.println("Reverse: " + sb.toString() + " (" + bigramAddress + ")"
+ + " Loop: " + totalLoopCount);
+ } else {
+ System.out.println(" bigram: " + sb.toString() + " (" + bigramAddress + ") freq: "
+ + frequency + " Loop: " + totalLoopCount);
+ }
+ }
+
+ static class Bigram {
+ String[] word2;
+ int[] freq;
+ int count;
+ static int sBigramNum = 0;
+
+ String getSecondWord(int i) {
+ return word2[i];
+ }
+
+ int getFrequency(int i) {
+ return (freq[i] == 0) ? 1 : freq[i];
+ }
+
+ void setWord2(int index, String word2, int freq) {
+ this.word2[index] = word2;
+ this.freq[index] = freq;
+ }
+
+ public Bigram(int word2Count) {
+ count = word2Count;
+ word2 = new String[word2Count];
+ freq = new int[word2Count];
+ }
+ }
+}
diff --git a/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
index 77a6401..0627879 100755
--- a/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
+++ b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
@@ -31,15 +31,24 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
/**
- * Compresses a list of words and frequencies into a tree structured binary dictionary.
+ * Compresses a list of words, frequencies, and bigram data
+ * into a tree structured binary dictionary.
+ * Dictionary Version: 200 (may contain bigrams)
+ * Version number started from 200 rather than 1 because we wanted to prevent number of roots in
+ * any old dictionaries being mistaken as the version number. There is not a chance that there
+ * will be more than 200 roots. Version number should be increased when there is structural change
+ * in the data. There is no need to increase the version when only the words in the data changes.
*/
public class MakeBinaryDictionary {
+ private static final int VERSION_NUM = 200;
+
public static final int ALPHA_SIZE = 256;
public static final String TAG_WORD = "w";
@@ -48,13 +57,15 @@
private static final int FLAG_ADDRESS_MASK = 0x400000;
private static final int FLAG_TERMINAL_MASK = 0x800000;
private static final int ADDRESS_MASK = 0x3FFFFF;
-
+
public static final CharNode EMPTY_NODE = new CharNode();
List<CharNode> roots;
Map<String, Integer> mDictionary;
int mWordCount;
-
+
+ BigramDictionary bigramDict;
+
static class CharNode {
char data;
int freq;
@@ -68,29 +79,46 @@
}
public static void usage() {
- System.err.println("Usage: makedict <src.xml> <dest.dict>");
+ System.err.println("Usage: makedict -s <src_dict.xml> [-b <src_bigram.xml>] "
+ + "-d <dest.dict>");
System.exit(-1);
}
public static void main(String[] args) {
- if (args.length < 2) {
- usage();
+ int checkSource = -1;
+ int checkBigram = -1;
+ int checkDest = -1;
+ for (int i = 0; i < args.length; i+=2) {
+ if (args[i].equals("-s")) checkSource = (i + 1);
+ if (args[i].equals("-b")) checkBigram = (i + 1);
+ if (args[i].equals("-d")) checkDest = (i + 1);
+ }
+ if (checkSource >= 0 && checkBigram >= 0 && checkDest >= 0 && args.length == 6) {
+ new MakeBinaryDictionary(args[checkSource], args[checkBigram], args[checkDest]);
+ } else if (checkSource >= 0 && checkDest >= 0 && args.length == 4) {
+ new MakeBinaryDictionary(args[checkSource], null, args[checkDest]);
} else {
- new MakeBinaryDictionary(args[0], args[1]);
+ usage();
}
}
- public MakeBinaryDictionary(String srcFilename, String destFilename) {
+ public MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename){
+ System.out.println("Generating dictionary version " + VERSION_NUM);
+ bigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null));
populateDictionary(srcFilename);
writeToDict(destFilename);
- // Enable the code below to verify that the generated tree is traversable.
+
+ // Enable the code below to verify that the generated tree is traversable
+ // and bigram data is stored correctly.
if (false) {
- traverseDict(0, new char[32], 0);
+ bigramDict.reverseLookupAll(mDictionary, dict);
+ traverseDict(2, new char[32], 0);
}
}
-
+
private void populateDictionary(String filename) {
roots = new ArrayList<CharNode>();
+ mDictionary = new HashMap<String, Integer>();
try {
SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(new File(filename), new DefaultHandler() {
@@ -205,8 +233,11 @@
private void addCount(int count) {
dict[dictSize++] = (byte) (0xFF & count);
}
-
- private void addNode(CharNode node) {
+
+ private void addNode(CharNode node, String word1) {
+ if (node.terminal) { // store address of each word1
+ mDictionary.put(word1, dictSize);
+ }
int charData = 0xFFFF & node.data;
if (charData > 254) {
dict[dictSize++] = (byte) 255;
@@ -226,6 +257,15 @@
if (node.terminal) {
byte freq = (byte) (0xFF & node.freq);
dict[dictSize++] = freq;
+ // bigram
+ if (bigramDict.mBi.containsKey(word1)) {
+ int count = bigramDict.mBi.get(word1).count;
+ bigramDict.mBigramToFill.add(word1);
+ bigramDict.mBigramToFillAddress.add(dictSize);
+ dictSize += (4 * count);
+ } else {
+ dict[dictSize++] = (byte) (0x00);
+ }
}
}
@@ -255,24 +295,27 @@
}
}
- void writeWordsRec(List<CharNode> children) {
+ void writeWordsRec(List<CharNode> children, StringBuilder word) {
if (children == null || children.size() == 0) {
return;
}
final int childCount = children.size();
addCount(childCount);
- //int childrenStart = dictSize;
int[] childrenAddresses = new int[childCount];
for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
childrenAddresses[j] = dictSize;
- addNode(node);
+ word.append(children.get(j).data);
+ addNode(node, word.toString());
+ word.deleteCharAt(word.length()-1);
}
for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
int nodeAddress = childrenAddresses[j];
int cacheDictSize = dictSize;
- writeWordsRec(node.children);
+ word.append(children.get(j).data);
+ writeWordsRec(node.children, word);
+ word.deleteCharAt(word.length()-1);
updateNodeAddress(nodeAddress, node, node.children != null
? cacheDictSize : 0);
}
@@ -284,7 +327,13 @@
// < 1MB in most cases, as there is a limit in the
// resource size in apks.
dictSize = 0;
- writeWordsRec(roots);
+
+ dict[dictSize++] = (byte) (0xFF & VERSION_NUM); // version info
+ dict[dictSize++] = (byte) (0xFF & (bigramDict.mHasBigram ? 1 : 0));
+
+ StringBuilder word = new StringBuilder(48);
+ writeWordsRec(roots, word);
+ dict = bigramDict.writeBigrams(dict, mDictionary);
System.out.println("Dict Size = " + dictSize);
try {
FileOutputStream fos = new FileOutputStream(dictFilename);
@@ -299,24 +348,36 @@
int count = dict[pos++] & 0xFF;
for (int i = 0; i < count; i++) {
char c = (char) (dict[pos++] & 0xFF);
- if (c == 0xFF) {
+ if (c == 0xFF) { // two byte character
c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
pos += 2;
}
word[depth] = c;
- boolean terminal = (dict[pos] & 0x80) > 0;
+ boolean terminal = getFirstBitOfByte(pos, dict);
int address = 0;
- if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) {
- address =
- ((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16)
- | ((dict[pos + 1] & 0xFF) << 8)
- | ((dict[pos + 2] & 0xFF));
- pos += 2;
+ if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check
+ address = get22BitAddress(pos, dict);
+ pos += 3;
+ } else {
+ pos += 1;
}
- pos++;
if (terminal) {
showWord(word, depth + 1, dict[pos] & 0xFF);
pos++;
+
+ int bigramExist = (dict[pos] & bigramDict.FLAG_BIGRAM_READ);
+ if (bigramExist > 0) {
+ int nextBigramExist = 1;
+ while (nextBigramExist > 0) {
+ int bigramAddress = get22BitAddress(pos, dict);
+ pos += 3;
+ int frequency = (bigramDict.FLAG_BIGRAM_FREQ & dict[pos]);
+ bigramDict.searchForTerminalNode(bigramAddress, frequency, dict);
+ nextBigramExist = (dict[pos++] & bigramDict.FLAG_BIGRAM_CONTINUED);
+ }
+ } else {
+ pos++;
+ }
}
if (address != 0) {
traverseDict(address, word, depth + 1);
@@ -327,4 +388,18 @@
void showWord(char[] word, int size, int freq) {
System.out.print(new String(word, 0, size) + " " + freq + "\n");
}
+
+ static int get22BitAddress(int pos, byte[] dict) {
+ return ((dict[pos + 0] & 0x3F) << 16)
+ | ((dict[pos + 1] & 0xFF) << 8)
+ | ((dict[pos + 2] & 0xFF));
+ }
+
+ static boolean getFirstBitOfByte(int pos, byte[] dict) {
+ return (dict[pos] & 0x80) > 0;
+ }
+
+ static boolean getSecondBitOfByte(int pos, byte[] dict) {
+ return (dict[pos] & 0x40) > 0;
+ }
}