Generate English Dictionary (version : 200 - may contain bigrams)
 - created separate class for bigram

Change-Id: Iaee544c6e62dcd0a90730d20d5c955280e5cf0ae
diff --git a/tools/makedict/src/com/android/tools/dict/BigramDictionary.java b/tools/makedict/src/com/android/tools/dict/BigramDictionary.java
new file mode 100644
index 0000000..35115bf
--- /dev/null
+++ b/tools/makedict/src/com/android/tools/dict/BigramDictionary.java
@@ -0,0 +1,286 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.tools.dict;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+/**
+ * Helper for MakeBinaryDictionary
+ * Deals with all the bigram data
+ */
+public class BigramDictionary {
+
+    /*
+     * Must match the values in the client side which is located in dictionary.cpp & dictionary.h
+     * Changing these values will generate totally different structure which must be also reflected
+     * on the client side.
+     */
+    public static final int FLAG_BIGRAM_READ = 0x80;
+    public static final int FLAG_BIGRAM_CHILDEXIST = 0x40;
+    public static final int FLAG_BIGRAM_CONTINUED = 0x80;
+    public static final int FLAG_BIGRAM_FREQ = 0x7F;
+
+    public static final int FOR_REVERSE_LOOKUPALL = -99;
+
+    public ArrayList<String> mBigramToFill = new ArrayList<String>();
+    public ArrayList<Integer> mBigramToFillAddress = new ArrayList<Integer>();
+
+    public HashMap<String, Bigram> mBi;
+
+    public boolean mHasBigram;
+
+    public BigramDictionary(String bigramSrcFilename, boolean hasBigram) {
+        mHasBigram = hasBigram;
+        loadBigram(bigramSrcFilename);
+    }
+
+    private void loadBigram(String filename) {
+        mBi = new HashMap<String, Bigram>();
+        if (!mHasBigram) {
+            System.out.println("Number of bigrams = " + Bigram.sBigramNum);
+            return;
+        }
+        try {
+            SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+            parser.parse(new File(filename), new DefaultHandler() {
+                String w1 = null;
+                boolean inWord1 = false;
+                boolean inWord2 = false;
+                int freq = 0, counter = 0;
+                Bigram tempBigram = null;
+
+                @Override
+                public void startElement(String uri, String localName,
+                        String qName, Attributes attributes) {
+                    if (qName.equals("bi")) {
+                        inWord1 = true;
+                        w1 = attributes.getValue(0);
+                        int count = Integer.parseInt(attributes.getValue(1));
+                        tempBigram = new Bigram(count);
+                        counter = 0;
+                    } else if (qName.equals("w")) {
+                        inWord2 = true;
+                        String word2 = attributes.getValue(0);
+                        int freq = Integer.parseInt(attributes.getValue(1));
+                        tempBigram.setWord2(counter, word2, freq);
+                        counter++;
+                        Bigram.sBigramNum++;
+                    }
+                }
+
+                @Override
+                public void endElement(String uri, String localName,
+                        String qName) {
+                    if (inWord2) {
+                        inWord2 = false;
+                    } else if (inWord1) {
+                        inWord1 = false;
+                        mBi.put(w1, tempBigram);
+                    }
+                }
+            });
+        } catch (Exception ioe) {
+            System.err.println("Exception in parsing bigram\n" + ioe);
+            ioe.printStackTrace();
+        }
+        System.out.println("Number of bigrams = " + Bigram.sBigramNum);
+    }
+
+    byte[] writeBigrams(byte[] dict, Map<String, Integer> mDictionary) {
+        for (int i = 0; i < mBigramToFill.size(); i++) {
+            String w1 = mBigramToFill.get(i);
+            int address = mBigramToFillAddress.get(i);
+
+            Bigram temp = mBi.get(w1);
+            int word2Count = temp.count;
+            int j4;
+            for (int j = 0; j < word2Count; j++) {
+                if (!mDictionary.containsKey(temp.word2[j])) {
+                    System.out.println("Not in dictionary: " + temp.word2[j]);
+                    System.exit(0);
+                } else {
+                    j4 = (j * 4);
+                    int addressOfWord2 = mDictionary.get(temp.word2[j]);
+                    dict[address + j4 + 0] = (byte) (((addressOfWord2 & 0x3F0000) >> 16)
+                            | FLAG_BIGRAM_READ);
+                    dict[address + j4 + 1] = (byte) ((addressOfWord2 & 0x00FF00) >> 8);
+                    dict[address + j4 + 2] = (byte) ((addressOfWord2 & 0x0000FF));
+
+                    if (j == (word2Count - 1)) {
+                        dict[address + j4 + 3] = (byte) (temp.freq[j] & FLAG_BIGRAM_FREQ);
+                    } else {
+                        dict[address + j4 + 3] = (byte) ((temp.freq[j] & FLAG_BIGRAM_FREQ)
+                                | FLAG_BIGRAM_CONTINUED);
+                    }
+                }
+            }
+        }
+
+        return dict;
+    }
+
+    void reverseLookupAll(Map<String, Integer> mDictionary, byte[] dict) {
+        Set<String> st = mDictionary.keySet();
+        for (String s : st) {
+            searchForTerminalNode(mDictionary.get(s), FOR_REVERSE_LOOKUPALL, dict);
+        }
+    }
+
+    void searchForTerminalNode(int bigramAddress, int frequency, byte[] dict) {
+        StringBuilder sb = new StringBuilder(48);
+        int pos;
+        boolean found = false;
+        int followDownBranchAddress = 2;
+        char followingChar = ' ';
+        int depth = 0;
+        int totalLoopCount = 0;
+
+        while (!found) {
+            boolean followDownAddressSearchStop = false;
+            boolean firstAddress = true;
+            boolean haveToSearchAll = true;
+
+            if (depth > 0) {
+                sb.append(followingChar);
+            }
+            pos = followDownBranchAddress; // pos start at count
+            int count = dict[pos] & 0xFF;
+            pos++;
+            for (int i = 0; i < count; i++) {
+                totalLoopCount++;
+                // pos at data
+                pos++;
+                // pos now at flag
+                if (!MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // non-terminal
+                    if (!followDownAddressSearchStop) {
+                        int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
+                        if (addr > bigramAddress) {
+                            followDownAddressSearchStop = true;
+                            if (firstAddress) {
+                                firstAddress = false;
+                                haveToSearchAll = true;
+                            } else if (!haveToSearchAll) {
+                                break;
+                            }
+                        } else {
+                            followDownBranchAddress = addr;
+                            followingChar = (char) (0xFF & dict[pos-1]);
+                            if(firstAddress) {
+                                firstAddress = false;
+                                haveToSearchAll = false;
+                            }
+                        }
+                    }
+                    pos += 3;
+                } else if (MakeBinaryDictionary.getFirstBitOfByte(pos, dict)) { // terminal
+                    // found !!
+                    if (bigramAddress == (pos-1)) {
+                        sb.append((char) (0xFF & dict[pos-1]));
+                        found = true;
+                        break;
+                    }
+
+                    // address + freq (4 byte)
+                    if (MakeBinaryDictionary.getSecondBitOfByte(pos, dict)) {
+                        if (!followDownAddressSearchStop) {
+                            int addr = MakeBinaryDictionary.get22BitAddress(pos, dict);
+                            if (addr > bigramAddress) {
+                                followDownAddressSearchStop = true;
+                                if (firstAddress) {
+                                    firstAddress = false;
+                                    haveToSearchAll = true;
+                                } else if (!haveToSearchAll) {
+                                    break;
+                                }
+                            } else {
+                                followDownBranchAddress = addr;
+                                followingChar = (char) (0xFF & dict[pos-1]);
+                                if(firstAddress) {
+                                    firstAddress = false;
+                                    haveToSearchAll = true;
+                                }
+                            }
+                        }
+                        pos += 4;
+                    } else { // freq only (2 byte)
+                        pos += 2;
+                    }
+                    // skipping bigram
+                    int bigramExist = (dict[pos] & FLAG_BIGRAM_READ);
+                    if (bigramExist > 0) {
+                        int nextBigramExist = 1;
+                        while (nextBigramExist > 0) {
+                            pos += 3;
+                            nextBigramExist = (dict[pos++] & FLAG_BIGRAM_CONTINUED);
+                        }
+                    } else {
+                        pos++;
+                    }
+                }
+            }
+            depth++;
+            if (followDownBranchAddress == 2) {
+                System.out.println("ERROR!!! Cannot find bigram!!");
+                System.exit(0);
+            }
+        }
+
+        if (frequency == FOR_REVERSE_LOOKUPALL) {
+            System.out.println("Reverse: " + sb.toString() + " (" + bigramAddress + ")"
+                    + "   Loop: " + totalLoopCount);
+        } else {
+            System.out.println("   bigram: " + sb.toString() + " (" + bigramAddress + ") freq: "
+                    + frequency + "   Loop: " + totalLoopCount);
+        }
+    }
+
+    static class Bigram {
+        String[] word2;
+        int[] freq;
+        int count;
+        static int sBigramNum = 0;
+
+        String getSecondWord(int i) {
+            return word2[i];
+        }
+
+        int getFrequency(int i) {
+            return (freq[i] == 0) ? 1 : freq[i];
+        }
+
+        void setWord2(int index, String word2, int freq) {
+            this.word2[index] = word2;
+            this.freq[index] = freq;
+        }
+
+        public Bigram(int word2Count) {
+            count = word2Count;
+            word2 = new String[word2Count];
+            freq = new int[word2Count];
+        }
+    }
+}
diff --git a/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
index 77a6401..0627879 100755
--- a/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
+++ b/tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java
@@ -31,15 +31,24 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 
 /**
- * Compresses a list of words and frequencies into a tree structured binary dictionary.
+ * Compresses a list of words, frequencies, and bigram data
+ * into a tree structured binary dictionary.
+ * Dictionary Version: 200 (may contain bigrams)
+ *  Version number started from 200 rather than 1 because we wanted to prevent number of roots in
+ *  any old dictionaries being mistaken as the version number. There is not a chance that there
+ *  will be more than 200 roots. Version number should be increased when there is structural change
+ *  in the data. There is no need to increase the version when only the words in the data changes.
  */
 public class MakeBinaryDictionary {
 
+    private static final int VERSION_NUM = 200;
+
     public static final int ALPHA_SIZE = 256;
 
     public static final String TAG_WORD = "w";
@@ -48,13 +57,15 @@
     private static final int FLAG_ADDRESS_MASK  = 0x400000;
     private static final int FLAG_TERMINAL_MASK = 0x800000;
     private static final int ADDRESS_MASK = 0x3FFFFF;
-    
+
     public static final CharNode EMPTY_NODE = new CharNode();
 
     List<CharNode> roots;
     Map<String, Integer> mDictionary;
     int mWordCount;
-    
+
+    BigramDictionary bigramDict;
+
     static class CharNode {
         char data;
         int freq;
@@ -68,29 +79,46 @@
     }
 
     public static void usage() {
-        System.err.println("Usage: makedict <src.xml> <dest.dict>");
+        System.err.println("Usage: makedict -s <src_dict.xml> [-b <src_bigram.xml>] "
+                + "-d <dest.dict>");
         System.exit(-1);
     }
     
     public static void main(String[] args) {
-        if (args.length < 2) {
-            usage();
+        int checkSource = -1;
+        int checkBigram = -1;
+        int checkDest = -1;
+        for (int i = 0; i < args.length; i+=2) {
+            if (args[i].equals("-s")) checkSource = (i + 1);
+            if (args[i].equals("-b")) checkBigram = (i + 1);
+            if (args[i].equals("-d")) checkDest = (i + 1);
+        }
+        if (checkSource >= 0 && checkBigram >= 0 && checkDest >= 0 && args.length == 6) {
+            new MakeBinaryDictionary(args[checkSource], args[checkBigram], args[checkDest]);
+        } else if (checkSource >= 0 && checkDest >= 0 && args.length == 4) {
+            new MakeBinaryDictionary(args[checkSource], null, args[checkDest]);
         } else {
-            new MakeBinaryDictionary(args[0], args[1]);
+            usage();
         }
     }
 
-    public MakeBinaryDictionary(String srcFilename, String destFilename) {
+    public MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename){
+        System.out.println("Generating dictionary version " + VERSION_NUM);
+        bigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null));
         populateDictionary(srcFilename);
         writeToDict(destFilename);
-        // Enable the code below to verify that the generated tree is traversable.
+
+        // Enable the code below to verify that the generated tree is traversable
+        // and bigram data is stored correctly.
         if (false) {
-            traverseDict(0, new char[32], 0);
+            bigramDict.reverseLookupAll(mDictionary, dict);
+            traverseDict(2, new char[32], 0);
         }
     }
-    
+
     private void populateDictionary(String filename) {
         roots = new ArrayList<CharNode>();
+        mDictionary = new HashMap<String, Integer>();
         try {
             SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
             parser.parse(new File(filename), new DefaultHandler() {
@@ -205,8 +233,11 @@
     private void addCount(int count) {
         dict[dictSize++] = (byte) (0xFF & count);
     }
-    
-    private void addNode(CharNode node) {
+
+    private void addNode(CharNode node, String word1) {
+        if (node.terminal) { // store address of each word1
+            mDictionary.put(word1, dictSize);
+        }
         int charData = 0xFFFF & node.data;
         if (charData > 254) {
             dict[dictSize++] = (byte) 255;
@@ -226,6 +257,15 @@
         if (node.terminal) {
             byte freq = (byte) (0xFF & node.freq);
             dict[dictSize++] = freq;
+            // bigram
+            if (bigramDict.mBi.containsKey(word1)) {
+                int count = bigramDict.mBi.get(word1).count;
+                bigramDict.mBigramToFill.add(word1);
+                bigramDict.mBigramToFillAddress.add(dictSize);
+                dictSize += (4 * count);
+            } else {
+                dict[dictSize++] = (byte) (0x00);
+            }
         }
     }
 
@@ -255,24 +295,27 @@
         }
     }
 
-    void writeWordsRec(List<CharNode> children) {
+    void writeWordsRec(List<CharNode> children, StringBuilder word) {
         if (children == null || children.size() == 0) {
             return;
         }
         final int childCount = children.size();
         addCount(childCount);
-        //int childrenStart = dictSize;
         int[] childrenAddresses = new int[childCount];
         for (int j = 0; j < childCount; j++) {
             CharNode node = children.get(j);
             childrenAddresses[j] = dictSize;
-            addNode(node);
+            word.append(children.get(j).data);
+            addNode(node, word.toString());
+            word.deleteCharAt(word.length()-1);
         }
         for (int j = 0; j < childCount; j++) {
             CharNode node = children.get(j);
             int nodeAddress = childrenAddresses[j];
             int cacheDictSize = dictSize;
-            writeWordsRec(node.children);
+            word.append(children.get(j).data);
+            writeWordsRec(node.children, word);
+            word.deleteCharAt(word.length()-1);
             updateNodeAddress(nodeAddress, node, node.children != null
                     ? cacheDictSize : 0);
         }
@@ -284,7 +327,13 @@
                                           // < 1MB in most cases, as there is a limit in the
                                           // resource size in apks.
         dictSize = 0;
-        writeWordsRec(roots);
+
+        dict[dictSize++] = (byte) (0xFF & VERSION_NUM); // version info
+        dict[dictSize++] = (byte) (0xFF & (bigramDict.mHasBigram ? 1 : 0));
+
+        StringBuilder word = new StringBuilder(48);
+        writeWordsRec(roots, word);
+        dict = bigramDict.writeBigrams(dict, mDictionary);
         System.out.println("Dict Size = " + dictSize);
         try {
             FileOutputStream fos = new FileOutputStream(dictFilename);
@@ -299,24 +348,36 @@
         int count = dict[pos++] & 0xFF;
         for (int i = 0; i < count; i++) {
             char c = (char) (dict[pos++] & 0xFF);
-            if (c == 0xFF) {
+            if (c == 0xFF) { // two byte character
                 c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
                 pos += 2;
             }
             word[depth] = c;
-            boolean terminal = (dict[pos] & 0x80) > 0;
+            boolean terminal = getFirstBitOfByte(pos, dict);
             int address = 0;
-            if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) {
-                address = 
-                    ((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16)
-                    | ((dict[pos + 1] & 0xFF) << 8)
-                    | ((dict[pos + 2] & 0xFF));
-                pos += 2;
+            if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check
+                address = get22BitAddress(pos, dict);
+                pos += 3;
+            } else {
+                pos += 1;
             }
-            pos++;
             if (terminal) {
                 showWord(word, depth + 1, dict[pos] & 0xFF);
                 pos++;
+
+                int bigramExist = (dict[pos] & bigramDict.FLAG_BIGRAM_READ);
+                if (bigramExist > 0) {
+                    int nextBigramExist = 1;
+                    while (nextBigramExist > 0) {
+                        int bigramAddress = get22BitAddress(pos, dict);
+                        pos += 3;
+                        int frequency = (bigramDict.FLAG_BIGRAM_FREQ & dict[pos]);
+                        bigramDict.searchForTerminalNode(bigramAddress, frequency, dict);
+                        nextBigramExist = (dict[pos++] & bigramDict.FLAG_BIGRAM_CONTINUED);
+                    }
+                } else {
+                    pos++;
+                }
             }
             if (address != 0) {
                 traverseDict(address, word, depth + 1);
@@ -327,4 +388,18 @@
     void showWord(char[] word, int size, int freq) {
         System.out.print(new String(word, 0, size) + " " + freq + "\n");
     }
+
+    static int get22BitAddress(int pos, byte[] dict) {
+        return ((dict[pos + 0] & 0x3F) << 16)
+                | ((dict[pos + 1] & 0xFF) << 8)
+                | ((dict[pos + 2] & 0xFF));
+    }
+
+    static boolean getFirstBitOfByte(int pos, byte[] dict) {
+        return (dict[pos] & 0x80) > 0;
+    }
+
+    static boolean getSecondBitOfByte(int pos, byte[] dict) {
+        return (dict[pos] & 0x40) > 0;
+    }
 }