| /* |
| * Copyright (C) 2011 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| * use this file except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| * License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| |
| package com.android.inputmethod.latin.dicttool; |
| |
| import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; |
| import com.android.inputmethod.latin.makedict.FusionDictionary; |
| import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; |
| import com.android.inputmethod.latin.makedict.ProbabilityInfo; |
| import com.android.inputmethod.latin.makedict.WeightedString; |
| import com.android.inputmethod.latin.makedict.WordProperty; |
| |
| import org.xml.sax.Attributes; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.DefaultHandler; |
| |
| import java.io.BufferedInputStream; |
| import java.io.BufferedReader; |
| import java.io.BufferedWriter; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.TreeSet; |
| |
| import javax.xml.parsers.ParserConfigurationException; |
| import javax.xml.parsers.SAXParser; |
| import javax.xml.parsers.SAXParserFactory; |
| |
| /** |
| * Reads and writes XML files for a FusionDictionary. |
| * |
| * All functions in this class are static. |
| */ |
| public class XmlDictInputOutput { |
| |
| private static final String ROOT_TAG = "wordlist"; |
| private static final String WORD_TAG = "w"; |
| private static final String BIGRAM_TAG = "bigram"; |
| private static final String SHORTCUT_TAG = "shortcut"; |
| private static final String PROBABILITY_ATTR = "f"; |
| private static final String WORD_ATTR = "word"; |
| private static final String NOT_A_WORD_ATTR = "not_a_word"; |
| |
| /** |
| * SAX handler for a unigram XML file. |
| */ |
| static private class UnigramHandler extends DefaultHandler { |
| // Parser states |
| private static final int START = 1; |
| private static final int WORD = 2; |
| private static final int UNKNOWN = 3; |
| private static final int SHORTCUT_ONLY_WORD_PROBABILITY = 1; |
| |
| FusionDictionary mDictionary; |
| int mState; // the state of the parser |
| int mFreq; // the currently read freq |
| String mWord; // the current word |
| final HashMap<String, ArrayList<WeightedString>> mShortcutsMap; |
| |
| /** |
| * Create the handler. |
| * |
| * @param shortcuts the shortcuts as a map. This may be empty, but may not be null. |
| */ |
| public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) { |
| mDictionary = null; |
| mShortcutsMap = shortcuts; |
| mWord = ""; |
| mState = START; |
| mFreq = 0; |
| } |
| |
| public FusionDictionary getFinalDictionary() { |
| final FusionDictionary dict = mDictionary; |
| for (final String shortcutOnly : mShortcutsMap.keySet()) { |
| if (dict.hasWord(shortcutOnly)) continue; |
| dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY), |
| mShortcutsMap.get(shortcutOnly), true /* isNotAWord */); |
| } |
| mDictionary = null; |
| mShortcutsMap.clear(); |
| mWord = ""; |
| mState = START; |
| mFreq = 0; |
| return dict; |
| } |
| |
| @Override |
| public void startElement(String uri, String localName, String qName, Attributes attrs) { |
| if (WORD_TAG.equals(localName)) { |
| mState = WORD; |
| mWord = ""; |
| for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { |
| final String attrName = attrs.getLocalName(attrIndex); |
| if (PROBABILITY_ATTR.equals(attrName)) { |
| mFreq = Integer.parseInt(attrs.getValue(attrIndex)); |
| } |
| } |
| } else if (ROOT_TAG.equals(localName)) { |
| final HashMap<String, String> attributes = new HashMap<>(); |
| for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) { |
| final String attrName = attrs.getLocalName(attrIndex); |
| attributes.put(attrName, attrs.getValue(attrIndex)); |
| } |
| mDictionary = new FusionDictionary(new PtNodeArray(), |
| new DictionaryOptions(attributes)); |
| } else { |
| mState = UNKNOWN; |
| } |
| } |
| |
| @Override |
| public void characters(char[] ch, int start, int length) { |
| if (WORD == mState) { |
| // The XML parser is free to return text in arbitrary chunks one after the |
| // other. In particular, this happens in some implementations when it finds |
| // an escape code like "&". |
| mWord += String.copyValueOf(ch, start, length); |
| } |
| } |
| |
| @Override |
| public void endElement(String uri, String localName, String qName) { |
| if (WORD == mState) { |
| mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord), |
| false /* isNotAWord */); |
| mState = START; |
| } |
| } |
| } |
| |
| static private class AssociativeListHandler extends DefaultHandler { |
| private final String SRC_TAG; |
| private final String SRC_ATTRIBUTE; |
| private final String DST_TAG; |
| private final String DST_ATTRIBUTE; |
| private final String DST_FREQ; |
| |
| // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX |
| private final static int XML_MAX = 256; |
| // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX |
| private final static int MEMORY_MAX = 256; |
| private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX; |
| |
| private String mSrc; |
| private final HashMap<String, ArrayList<WeightedString>> mAssocMap; |
| |
| public AssociativeListHandler(final String srcTag, final String srcAttribute, |
| final String dstTag, final String dstAttribute, final String dstFreq) { |
| SRC_TAG = srcTag; |
| SRC_ATTRIBUTE = srcAttribute; |
| DST_TAG = dstTag; |
| DST_ATTRIBUTE = dstAttribute; |
| DST_FREQ = dstFreq; |
| mSrc = null; |
| mAssocMap = new HashMap<>(); |
| } |
| |
| @Override |
| public void startElement(String uri, String localName, String qName, Attributes attrs) { |
| if (SRC_TAG.equals(localName)) { |
| mSrc = attrs.getValue(uri, SRC_ATTRIBUTE); |
| } else if (DST_TAG.equals(localName)) { |
| String dst = attrs.getValue(uri, DST_ATTRIBUTE); |
| int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ)); |
| WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO); |
| ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc); |
| if (null == bigramList) bigramList = new ArrayList<>(); |
| bigramList.add(bigram); |
| mAssocMap.put(mSrc, bigramList); |
| } |
| } |
| |
| protected int getValueFromFreqString(final String freqString) { |
| return Integer.parseInt(freqString); |
| } |
| |
| // This may return an empty map, but will never return null. |
| public HashMap<String, ArrayList<WeightedString>> getAssocMap() { |
| return mAssocMap; |
| } |
| } |
| |
| /** |
| * SAX handler for a bigram XML file. |
| */ |
| static private class BigramHandler extends AssociativeListHandler { |
| private final static String BIGRAM_W1_TAG = "bi"; |
| private final static String BIGRAM_W2_TAG = "w"; |
| private final static String BIGRAM_W1_ATTRIBUTE = "w1"; |
| private final static String BIGRAM_W2_ATTRIBUTE = "w2"; |
| private final static String BIGRAM_FREQ_ATTRIBUTE = "p"; |
| |
| public BigramHandler() { |
| super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE, |
| BIGRAM_FREQ_ATTRIBUTE); |
| } |
| |
| // As per getAssocMap(), this never returns null. |
| public HashMap<String, ArrayList<WeightedString>> getBigramMap() { |
| return getAssocMap(); |
| } |
| } |
| |
| /** |
| * SAX handler for a shortcut & whitelist XML file. |
| */ |
| static private class ShortcutAndWhitelistHandler extends AssociativeListHandler { |
| private final static String ENTRY_TAG = "entry"; |
| private final static String ENTRY_ATTRIBUTE = "shortcut"; |
| private final static String TARGET_TAG = "target"; |
| private final static String REPLACEMENT_ATTRIBUTE = "replacement"; |
| private final static String TARGET_PRIORITY_ATTRIBUTE = "priority"; |
| private final static String WHITELIST_MARKER = "whitelist"; |
| private final static int WHITELIST_FREQ_VALUE = 15; |
| private final static int MIN_FREQ = 0; |
| private final static int MAX_FREQ = 14; |
| |
| public ShortcutAndWhitelistHandler() { |
| super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE, |
| TARGET_PRIORITY_ATTRIBUTE); |
| } |
| |
| @Override |
| protected int getValueFromFreqString(final String freqString) { |
| if (WHITELIST_MARKER.equals(freqString)) { |
| return WHITELIST_FREQ_VALUE; |
| } |
| final int intValue = super.getValueFromFreqString(freqString); |
| if (intValue < MIN_FREQ || intValue > MAX_FREQ) { |
| throw new RuntimeException("Shortcut freq out of range. Accepted range is " |
| + MIN_FREQ + ".." + MAX_FREQ); |
| } |
| return intValue; |
| } |
| |
| // As per getAssocMap(), this never returns null. |
| public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() { |
| return getAssocMap(); |
| } |
| } |
| |
| /** |
| * Basic test to find out whether the file is in the unigram XML format or not. |
| * |
| * Concretely this only tests the header line. |
| * |
| * @param filename The name of the file to test. |
| * @return true if the file is in the unigram XML format, false otherwise |
| */ |
| public static boolean isXmlUnigramDictionary(final String filename) { |
| try (final BufferedReader reader = new BufferedReader( |
| new InputStreamReader(new FileInputStream(filename), "UTF-8"))) { |
| final String firstLine = reader.readLine(); |
| return firstLine.matches("^\\s*<wordlist .*>\\s*$"); |
| } catch (final IOException e) { |
| return false; |
| } |
| } |
| |
| /** |
| * Reads a dictionary from an XML file. |
| * |
| * This is the public method that will parse an XML file and return the corresponding memory |
| * representation. |
| * |
| * @param unigrams the file to read the data from. |
| * @param shortcuts the file to read the shortcuts & whitelist from, or null. |
| * @param bigrams the file to read the bigrams from, or null. |
| * @return the in-memory representation of the dictionary. |
| */ |
| public static FusionDictionary readDictionaryXml(final BufferedInputStream unigrams, |
| final BufferedInputStream shortcuts, final BufferedInputStream bigrams) |
| throws SAXException, IOException, ParserConfigurationException { |
| final SAXParserFactory factory = SAXParserFactory.newInstance(); |
| factory.setNamespaceAware(true); |
| final SAXParser parser = factory.newSAXParser(); |
| final BigramHandler bigramHandler = new BigramHandler(); |
| if (null != bigrams) parser.parse(bigrams, bigramHandler); |
| |
| final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler = |
| new ShortcutAndWhitelistHandler(); |
| if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler); |
| |
| final UnigramHandler unigramHandler = |
| new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap()); |
| parser.parse(unigrams, unigramHandler); |
| final FusionDictionary dict = unigramHandler.getFinalDictionary(); |
| final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap(); |
| for (final String firstWord : bigramMap.keySet()) { |
| if (!dict.hasWord(firstWord)) continue; |
| final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord); |
| for (final WeightedString bigram : bigramList) { |
| if (!dict.hasWord(bigram.mWord)) continue; |
| dict.setBigram(firstWord, bigram.mWord, bigram.mProbabilityInfo); |
| } |
| } |
| return dict; |
| } |
| |
| /** |
| * Reads a dictionary in the first, legacy XML format |
| * |
| * This method reads data from the parser and creates a new FusionDictionary with it. |
| * The format parsed by this method is the format used before Ice Cream Sandwich, |
| * which has no support for bigrams or shortcuts/whitelist. |
| * It is important to note that this method expects the parser to have already eaten |
| * the first, all-encompassing tag. |
| * |
| * @param xpp the parser to read the data from. |
| * @return the parsed dictionary. |
| */ |
| |
| /** |
| * Writes a dictionary to an XML file. |
| * |
| * The output format is the "second" format, which supports bigrams and shortcuts/whitelist. |
| * |
| * @param destination a destination stream to write to. |
| * @param dict the dictionary to write. |
| */ |
| public static void writeDictionaryXml(final BufferedWriter destination, |
| final FusionDictionary dict) throws IOException { |
| final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>(); |
| for (WordProperty wordProperty : dict) { |
| wordPropertiesInDict.add(wordProperty); |
| } |
| // TODO: use an XMLSerializer if this gets big |
| destination.write("<wordlist format=\"2\""); |
| for (final String key : dict.mOptions.mAttributes.keySet()) { |
| final String value = dict.mOptions.mAttributes.get(key); |
| destination.write(" " + key + "=\"" + value + "\""); |
| } |
| destination.write(">\n"); |
| destination.write("<!-- Warning: there is no code to read this format yet. -->\n"); |
| for (WordProperty wordProperty : wordPropertiesInDict) { |
| destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + wordProperty.mWord |
| + "\" " + PROBABILITY_ATTR + "=\"" + wordProperty.getProbability() |
| + (wordProperty.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") |
| + "\">"); |
| if (null != wordProperty.mShortcutTargets) { |
| destination.write("\n"); |
| for (WeightedString target : wordProperty.mShortcutTargets) { |
| destination.write(" <" + SHORTCUT_TAG + " " + PROBABILITY_ATTR + "=\"" |
| + target.getProbability() + "\">" + target.mWord + "</" + SHORTCUT_TAG |
| + ">\n"); |
| } |
| destination.write(" "); |
| } |
| if (null != wordProperty.mBigrams) { |
| destination.write("\n"); |
| for (WeightedString bigram : wordProperty.mBigrams) { |
| destination.write(" <" + BIGRAM_TAG + " " + PROBABILITY_ATTR + "=\"" |
| + bigram.getProbability() + "\">" + bigram.mWord |
| + "</" + BIGRAM_TAG + ">\n"); |
| } |
| destination.write(" "); |
| } |
| destination.write("</" + WORD_TAG + ">\n"); |
| } |
| destination.write("</wordlist>\n"); |
| destination.close(); |
| } |
| } |