blob: 5dfb7bf1179b5859abbcf7e9275a8e35ab8c327c [file] [log] [blame]
/*
* Copyright (C) 2011 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.android.inputmethod.latin.dicttool;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
import com.android.inputmethod.latin.makedict.DictDecoder;
import com.android.inputmethod.latin.makedict.DictEncoder;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.MakedictLog;
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
import com.android.inputmethod.latin.makedict.Ver4DictEncoder;
import org.xml.sax.SAXException;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.LinkedList;
import javax.xml.parsers.ParserConfigurationException;
/**
* Main class/method for DictionaryMaker.
*/
public class DictionaryMaker {
static class Arguments {
// Switches selecting the binary output format. "-2" keeps the default format version,
// "-4" selects FormatSpec.VERSION4.
private static final String OPTION_VERSION_2 = "-2";
private static final String OPTION_VERSION_4 = "-4";
// "-s <file>": input file; the constructor detects whether it is XML, combined or binary.
private static final String OPTION_INPUT_SOURCE = "-s";
// "-b <file>": separate bigram XML input.
private static final String OPTION_INPUT_BIGRAM_XML = "-b";
// "-c <file>": separate shortcut XML input.
private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
// "-d <file>": binary output.
private static final String OPTION_OUTPUT_BINARY = "-d";
// "-x <file>": XML output.
private static final String OPTION_OUTPUT_XML = "-x";
// "-o <file>": combined format output.
private static final String OPTION_OUTPUT_COMBINED = "-o";
// "-h": display the usage message.
private static final String OPTION_HELP = "-h";
// "-t <on|off>": code point table mode; the two accepted values follow.
private static final String OPTION_CODE_POINT_TABLE = "-t";
private static final String OPTION_CODE_POINT_TABLE_OFF = "off";
private static final String OPTION_CODE_POINT_TABLE_ON = "on";
// Parsed values, assigned once by the constructor. A null path means the corresponding
// input or output was not specified on the command line.
public final String mInputBinary;
public final String mInputCombined;
public final String mInputUnigramXml;
public final String mInputShortcutXml;
public final String mInputBigramXml;
public final String mOutputBinary;
public final String mOutputXml;
public final String mOutputCombined;
// FormatSpec.VERSION201 unless "-4" was given, in which case FormatSpec.VERSION4.
public final int mOutputBinaryFormatVersion;
// One of Ver2DictEncoder.CODE_POINT_TABLE_OFF (the default) or CODE_POINT_TABLE_ON.
public final int mCodePointTableMode;
/**
 * Validates the parsed arguments as a whole.
 *
 * Checks the number of inputs and outputs, then refuses any combination in which two of
 * the specified paths resolve to the same file: no input may double as an output, and no
 * two outputs may target the same file. Unspecified (null) paths are ignored.
 *
 * @throws IOException if the canonical path of a file can't be resolved.
 * @throws RuntimeException if the inputs/outputs are invalid or two paths are the same file.
 */
private void checkIntegrity() throws IOException {
    checkHasExactlyOneInput();
    checkHasAtLeastOneOutput();
    final String[] inputs = { mInputBinary, mInputCombined, mInputUnigramXml,
            mInputShortcutXml, mInputBigramXml };
    final String[] outputs = { mOutputBinary, mOutputXml, mOutputCombined };
    // Every input is checked against every output, including the combined output, so that
    // writing an output can never clobber the file the dictionary was read from.
    for (final String input : inputs) {
        for (final String output : outputs) {
            checkNotSameFile(input, output);
        }
    }
    // Outputs are checked pairwise so that one output does not overwrite another.
    for (int i = 0; i < outputs.length; ++i) {
        for (int j = i + 1; j < outputs.length; ++j) {
            checkNotSameFile(outputs[i], outputs[j]);
        }
    }
}
/**
 * Checks that one and only one main input (unigram XML, binary or combined) was given,
 * and that separate bigram/shortcut files only accompany an XML input.
 *
 * @throws RuntimeException if the combination of inputs is not acceptable.
 */
private void checkHasExactlyOneInput() {
    final boolean hasXml = mInputUnigramXml != null;
    final boolean hasBinary = mInputBinary != null;
    final boolean hasCombined = mInputCombined != null;
    final int mainInputCount = (hasXml ? 1 : 0) + (hasBinary ? 1 : 0) + (hasCombined ? 1 : 0);
    if (mainInputCount == 0) {
        throw new RuntimeException("No input file specified");
    }
    if (mainInputCount > 1) {
        throw new RuntimeException("Several input files specified");
    }
    // Exactly one main input from here on: if it is not XML, it is binary or combined.
    final boolean hasSeparateExtras = mInputBigramXml != null || mInputShortcutXml != null;
    if (!hasXml && hasSeparateExtras) {
        throw new RuntimeException("Separate bigrams/shortcut files are only supported"
                + " with XML input (other formats include bigrams and shortcuts already)");
    }
}
/**
 * Checks that at least one of the binary, XML or combined outputs was requested.
 *
 * @throws RuntimeException if no output was specified.
 */
private void checkHasAtLeastOneOutput() {
    final boolean hasOutput =
            mOutputBinary != null || mOutputXml != null || mOutputCombined != null;
    if (!hasOutput) {
        throw new RuntimeException("No output specified");
    }
}
/**
 * Refuses to proceed when both paths designate the same file.
 *
 * The comparison is made on canonical paths. If either path is null, nothing is checked.
 *
 * @param path1 the first path, or null.
 * @param path2 the second path, or null.
 * @throws IOException if a canonical path can't be resolved.
 * @throws RuntimeException if both paths resolve to the same file.
 */
private static void checkNotSameFile(final String path1, final String path2)
        throws IOException {
    if (null != path1 && null != path2) {
        final String canonical1 = new File(path1).getCanonicalPath();
        final String canonical2 = new File(path2).getCanonicalPath();
        if (canonical1.equals(canonical2)) {
            throw new RuntimeException(path1 + " and " + path2 + " are the same file: "
                    + " refusing to process.");
        }
    }
}
/**
 * Sends the usage message to the log through MakedictLog.i.
 */
private void displayHelp() {
    final String usage = getHelp();
    MakedictLog.i(usage);
}
/**
 * Returns the usage message of the tool.
 *
 * The message only lists switches that the argument parser actually accepts: the binary
 * format switches are -2 and -4, the code point table switch takes "on" or "off", and -h
 * displays the help.
 *
 * @return the usage message.
 */
public static String getHelp() {
    return "Usage: makedict "
            + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>]] "
            + "| [-s <combined format input>] "
            + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
            + "[-o <combined output>] [-t <code point table switch: on/off>] "
            + "[-2] [-4] [-h]\n"
            + "\n"
            + "  Converts a source dictionary file to one or several outputs.\n"
            + "  Source can be an XML file, with optional XML bigrams and shortcuts files,\n"
            + "  a combined format file, or a binary dictionary file.\n"
            + "  Binary version 2 (Jelly Bean), 4, XML and\n"
            + "  combined format outputs are supported.";
}
/**
 * Parses the command line and validates the resulting set of files.
 *
 * Arguments starting with '-' are switches; all of them except -2, -4 and -h consume the
 * following argument as their value. Any other argument is positional: the first one is
 * the input file (unless an input was already given) and the next one is the binary
 * output file. Displaying the help does not stop parsing.
 *
 * @param argsArray the raw command line arguments.
 * @throws IOException if a file can't be examined while checking the arguments.
 * @throws IllegalArgumentException if an argument is empty, unknown, lacks its value, or
 *         if more than one positional binary output is given.
 * @throws RuntimeException if the resulting combination of inputs and outputs is invalid.
 */
public Arguments(String[] argsArray) throws IOException {
    final LinkedList<String> args = new LinkedList<>(Arrays.asList(argsArray));
    if (args.isEmpty()) {
        displayHelp();
    }
    String inputBinary = null;
    String inputCombined = null;
    String inputUnigramXml = null;
    String inputShortcutXml = null;
    String inputBigramXml = null;
    String outputBinary = null;
    String outputXml = null;
    String outputCombined = null;
    int outputBinaryFormatVersion = FormatSpec.VERSION201; // the default version is 201.
    // Don't use code point table by default.
    int codePointTableMode = Ver2DictEncoder.CODE_POINT_TABLE_OFF;
    while (!args.isEmpty()) {
        final String arg = args.removeFirst();
        if (arg.isEmpty()) {
            // An empty string is neither a switch nor a usable file name; reject it
            // explicitly instead of failing on charAt(0) below.
            throw new IllegalArgumentException("Empty argument");
        }
        if (arg.charAt(0) == '-') {
            if (OPTION_VERSION_2.equals(arg)) {
                // Do nothing, this is the default
            } else if (OPTION_VERSION_4.equals(arg)) {
                outputBinaryFormatVersion = FormatSpec.VERSION4;
            } else if (OPTION_HELP.equals(arg)) {
                displayHelp();
            } else {
                // All these options need an argument
                if (args.isEmpty()) {
                    throw new IllegalArgumentException("Option " + arg + " is unknown or "
                            + "requires an argument");
                }
                final String argValue = args.removeFirst();
                if (OPTION_INPUT_SOURCE.equals(arg)) {
                    if (XmlDictInputOutput.isXmlUnigramDictionary(argValue)) {
                        inputUnigramXml = argValue;
                    } else if (CombinedInputOutput.isCombinedDictionary(argValue)) {
                        inputCombined = argValue;
                    } else if (BinaryDictDecoderUtils.isBinaryDictionary(argValue)) {
                        inputBinary = argValue;
                    } else {
                        throw new IllegalArgumentException(
                                "Unknown format for file " + argValue);
                    }
                } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
                    inputShortcutXml = argValue;
                } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
                    inputBigramXml = argValue;
                } else if (OPTION_OUTPUT_BINARY.equals(arg)) {
                    outputBinary = argValue;
                } else if (OPTION_OUTPUT_XML.equals(arg)) {
                    outputXml = argValue;
                } else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
                    outputCombined = argValue;
                } else if (OPTION_CODE_POINT_TABLE.equals(arg)) {
                    if (OPTION_CODE_POINT_TABLE_OFF.equals(argValue)) {
                        codePointTableMode = Ver2DictEncoder.CODE_POINT_TABLE_OFF;
                    } else if (OPTION_CODE_POINT_TABLE_ON.equals(argValue)) {
                        codePointTableMode = Ver2DictEncoder.CODE_POINT_TABLE_ON;
                    } else {
                        throw new IllegalArgumentException(
                                "Unknown argument to -t option : " + argValue);
                    }
                } else {
                    throw new IllegalArgumentException("Unknown option : " + arg);
                }
            }
        } else {
            // A positional argument is the input only while no input of any kind is
            // known. The combined input must be part of this test, otherwise the
            // positional output following a combined input is mistaken for a second input.
            if (null == inputBinary && null == inputCombined && null == inputUnigramXml) {
                if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) {
                    inputBinary = arg;
                } else if (CombinedInputOutput.isCombinedDictionary(arg)) {
                    inputCombined = arg;
                } else {
                    inputUnigramXml = arg;
                }
            } else if (null == outputBinary) {
                outputBinary = arg;
            } else {
                throw new IllegalArgumentException("Several output binary files specified");
            }
        }
    }
    mInputBinary = inputBinary;
    mInputCombined = inputCombined;
    mInputUnigramXml = inputUnigramXml;
    mInputShortcutXml = inputShortcutXml;
    mInputBigramXml = inputBigramXml;
    mOutputBinary = outputBinary;
    mOutputXml = outputXml;
    mOutputCombined = outputCombined;
    mOutputBinaryFormatVersion = outputBinaryFormatVersion;
    mCodePointTableMode = codePointTableMode;
    checkIntegrity();
}
}
/**
 * Entry point: parses the command line, reads the input dictionary and writes it to
 * every requested output.
 *
 * @param args the command line arguments.
 */
public static void main(String[] args)
        throws FileNotFoundException, ParserConfigurationException, SAXException, IOException,
        UnsupportedFormatException {
    final Arguments commandLine = new Arguments(args);
    writeOutputToParsedArgs(commandLine, readInputFromParsedArgs(commandLine));
}
/**
 * Dispatches to the reader matching the input specified in the arguments.
 *
 * The binary input is considered first, then the combined input, then the XML input.
 *
 * @param args the parsed command line arguments.
 * @return the read dictionary.
 * @throws RuntimeException if the arguments contain no input file.
 */
private static FusionDictionary readInputFromParsedArgs(final Arguments args)
        throws IOException, UnsupportedFormatException, ParserConfigurationException,
        SAXException, FileNotFoundException {
    if (args.mInputBinary != null) {
        return readBinaryFile(args.mInputBinary);
    }
    if (args.mInputCombined != null) {
        return readCombinedFile(args.mInputCombined);
    }
    if (args.mInputUnigramXml != null) {
        return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
    }
    throw new RuntimeException("No input file specified");
}
/**
 * Loads a dictionary stored in the binary dictionary format.
 *
 * The decoder is created over the whole file, from offset 0 to the file length.
 *
 * @param binaryFilename the name of the file in the binary dictionary format.
 * @return the read dictionary.
 * @throws FileNotFoundException if the file can't be found
 * @throws IOException if the input file can't be read
 * @throws UnsupportedFormatException if the binary file is not in the expected format
 */
private static FusionDictionary readBinaryFile(final String binaryFilename)
        throws FileNotFoundException, IOException, UnsupportedFormatException {
    final File dictFile = new File(binaryFilename);
    final long dictLength = dictFile.length();
    return BinaryDictIOUtils.getDictDecoder(dictFile, 0, dictLength)
            .readDictionaryBinary(false /* deleteDictIfBroken */);
}
/**
 * Loads a dictionary stored in the combined format.
 *
 * The file is decoded as UTF-8 text and closed once reading is done.
 *
 * @param combinedFilename the name of the file in the combined format.
 * @return the read dictionary.
 * @throws FileNotFoundException if the file can't be found
 * @throws IOException if the input file can't be read
 */
private static FusionDictionary readCombinedFile(final String combinedFilename)
        throws FileNotFoundException, IOException {
    try (
        final FileInputStream rawInput = new FileInputStream(combinedFilename);
        final BufferedReader combinedReader =
                new BufferedReader(new InputStreamReader(rawInput, "UTF-8"))
    ) {
        return CombinedInputOutput.readDictionaryCombined(combinedReader);
    }
}
/**
 * Opens a buffered stream on the named file.
 *
 * @param filename the name of the file to open, or null.
 * @return a buffered stream reading from the file, or null if filename is null.
 * @throws FileNotFoundException if the file can't be opened.
 */
private static BufferedInputStream getBufferedFileInputStream(final String filename)
        throws FileNotFoundException {
    return (filename == null)
            ? null
            : new BufferedInputStream(new FileInputStream(filename));
}
/**
 * Loads a dictionary from a unigram XML file plus optional shortcut and bigram XML files.
 *
 * A stream is opened for each non-null file name and closed once reading is done; a null
 * name results in a null stream being handed to the XML reader.
 *
 * @param unigramXmlFilename the name of the unigram XML file. May not be null.
 * @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none.
 * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
 * @return the read dictionary.
 * @throws FileNotFoundException if one of the files can't be found
 * @throws SAXException if one or more of the XML files is not well-formed
 * @throws IOException if one the input files can't be read
 * @throws ParserConfigurationException if the system can't create a SAX parser
 */
private static FusionDictionary readXmlFile(final String unigramXmlFilename,
        final String shortcutXmlFilename, final String bigramXmlFilename)
        throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
    try (
        final BufferedInputStream unigramStream =
                getBufferedFileInputStream(unigramXmlFilename);
        final BufferedInputStream shortcutStream =
                getBufferedFileInputStream(shortcutXmlFilename);
        final BufferedInputStream bigramStream =
                getBufferedFileInputStream(bigramXmlFilename);
    ) {
        return XmlDictInputOutput.readDictionaryXml(unigramStream, shortcutStream,
                bigramStream);
    }
}
/**
 * Writes the dictionary to every output requested on the command line.
 *
 * Outputs are produced in this order: binary, XML, combined. Outputs whose file name is
 * null are skipped.
 *
 * @param args the parsed arguments.
 * @param dict the dictionary to output.
 * @throws FileNotFoundException if one of the output files can't be created.
 * @throws IOException if one of the output files can't be written to.
 */
private static void writeOutputToParsedArgs(final Arguments args, final FusionDictionary dict)
        throws FileNotFoundException, IOException, UnsupportedFormatException,
        IllegalArgumentException {
    final String binaryPath = args.mOutputBinary;
    if (binaryPath != null) {
        writeBinaryDictionary(binaryPath, dict, args.mOutputBinaryFormatVersion,
                args.mCodePointTableMode);
    }
    final String xmlPath = args.mOutputXml;
    if (xmlPath != null) {
        writeXmlDictionary(xmlPath, dict);
    }
    final String combinedPath = args.mOutputCombined;
    if (combinedPath != null) {
        writeCombinedDictionary(combinedPath, dict);
    }
}
/**
 * Encodes the dictionary into a binary file.
 *
 * A Ver4DictEncoder is used for FormatSpec.VERSION4 and a Ver2DictEncoder for any other
 * version; only the latter receives the code point table mode.
 *
 * @param outputFilename the name of the file to write to.
 * @param dict the dictionary to write.
 * @param version the binary format version to use.
 * @param codePointTableMode the value to decide how we treat the code point table.
 * @throws FileNotFoundException if the output file can't be created.
 * @throws IOException if the output file can't be written to.
 */
private static void writeBinaryDictionary(final String outputFilename,
        final FusionDictionary dict, final int version, final int codePointTableMode)
        throws FileNotFoundException, IOException, UnsupportedFormatException {
    final File target = new File(outputFilename);
    final FormatSpec.FormatOptions options = new FormatSpec.FormatOptions(version);
    // VERSION4 doesn't use the code point table.
    final DictEncoder encoder = (version == FormatSpec.VERSION4)
            ? new Ver4DictEncoder(target)
            : new Ver2DictEncoder(target, codePointTableMode);
    encoder.writeDictionary(dict, options);
}
/**
 * Write the dictionary in XML format to the specified filename.
 *
 * The file is always encoded as UTF-8, regardless of the platform default charset, so that
 * words containing non-ASCII characters are written identically on every machine.
 *
 * @param outputFilename the name of the file to write to.
 * @param dict the dictionary to write.
 * @throws FileNotFoundException if the output file can't be created.
 * @throws IOException if the output file can't be written to.
 */
private static void writeXmlDictionary(final String outputFilename,
        final FusionDictionary dict) throws FileNotFoundException, IOException {
    // FileWriter would silently use the platform default charset; pick UTF-8 explicitly.
    try (final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(outputFilename), StandardCharsets.UTF_8))) {
        XmlDictInputOutput.writeDictionaryXml(writer, dict);
    }
}
/**
 * Write the dictionary in the combined format to the specified filename.
 *
 * The file is always encoded as UTF-8, which is the encoding used when a combined format
 * file is read back, instead of depending on the platform default charset.
 *
 * @param outputFilename the name of the file to write to.
 * @param dict the dictionary to write.
 * @throws FileNotFoundException if the output file can't be created.
 * @throws IOException if the output file can't be written to.
 */
private static void writeCombinedDictionary(final String outputFilename,
        final FusionDictionary dict) throws FileNotFoundException, IOException {
    // FileWriter would silently use the platform default charset; pick UTF-8 explicitly.
    try (final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(outputFilename), StandardCharsets.UTF_8))) {
        CombinedInputOutput.writeDictionaryCombined(writer, dict);
    }
}
}