blob: 5e7aca5bba3e35cfcf1643fadb3d2d1734dc4102 [file] [log] [blame]
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.android.inputmethod.latin.dicttool;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;
import com.android.inputmethod.latin.utils.CombinedFormatUtils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;
/**
 * Reads and writes combined format for a FusionDictionary.
 *
 * A combined file starts with a single header line of comma-separated {@code key=value}
 * attributes, followed by one line per word entry and, after each word line, zero or more
 * bigram lines that belong to that word. Lines starting with {@code #} are comments.
 *
 * All functions in this class are static.
 */
public class CombinedInputOutput {
    private static final String WHITELIST_TAG = "whitelist";
    private static final String OPTIONS_TAG = "options";
    private static final String COMMENT_LINE_STARTER = "#";
    private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;

    /**
     * Basic test to find out whether the file is in the combined format or not.
     *
     * Concretely this only tests the header line, i.e. the first line that is not a comment.
     *
     * @param filename The name of the file to test.
     * @return true if the file is in the combined format, false otherwise. A file that cannot
     *         be read, is empty, or contains only comment lines is reported as not combined.
     */
    public static boolean isCombinedDictionary(final String filename) {
        try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            final String firstLine = readFirstNonCommentLine(reader);
            if (null == firstLine) {
                // Empty file or comments only: there is no header, so this is not combined.
                return false;
            }
            return firstLine.matches(
                    "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
        } catch (final IOException e) {
            return false;
        }
    }

    /**
     * Reads a dictionary from a combined format file.
     *
     * This is the public method that will read a combined file and return the corresponding memory
     * representation.
     *
     * @param reader the buffered reader to read the data from.
     * @return the in-memory representation of the dictionary.
     * @throws IOException if reading from the reader fails.
     * @throws RuntimeException if the header is missing or malformed, or if a word or bigram
     *         line is malformed.
     */
    public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
            throws IOException {
        final String headerLine = readFirstNonCommentLine(reader);
        if (null == headerLine) {
            // readLine() returned null before any non-comment line was found.
            throw new RuntimeException("Wrong header format : no header line found");
        }
        final String header[] = headerLine.split(",");
        final HashMap<String, String> attributes = new HashMap<>();
        for (String item : header) {
            final String keyValue[] = item.split("=");
            if (2 != keyValue.length) {
                throw new RuntimeException("Wrong header format : " + headerLine);
            }
            attributes.put(keyValue[0], keyValue[1]);
        }
        // The options attribute is parsed but deliberately not forwarded to the dictionary.
        attributes.remove(OPTIONS_TAG);
        final FusionDictionary dict =
                new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));

        String line;
        // State of the word entry currently being assembled. It is committed to the dictionary
        // when the next word line (or the end of input) is reached, because the bigram lines
        // that follow a word line belong to that word.
        String word = null;
        ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
        boolean isNotAWord = false;
        boolean isPossiblyOffensive = false;
        final ArrayList<WeightedString> bigrams = new ArrayList<>();
        while (null != (line = reader.readLine())) {
            if (line.startsWith(COMMENT_LINE_STARTER)) continue;
            final String args[] = line.trim().split(",");
            if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
                // A new word starts: commit the previous one, then reset all per-word state.
                addWordWithBigrams(dict, word, probabilityInfo, isNotAWord,
                        isPossiblyOffensive, bigrams);
                bigrams.clear();
                // Reset to the same initial value used before the first entry, so that a line
                // lacking a probability or historical info tag does not silently inherit the
                // values of the previous word.
                probabilityInfo = new ProbabilityInfo(0);
                isNotAWord = false;
                isPossiblyOffensive = false;
                for (String param : args) {
                    final String params[] = param.split("=", 2);
                    if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
                    switch (params[0]) {
                    case CombinedFormatUtils.WORD_TAG:
                        word = params[1];
                        break;
                    case CombinedFormatUtils.PROBABILITY_TAG:
                        probabilityInfo = withProbability(probabilityInfo, params[1]);
                        break;
                    case CombinedFormatUtils.HISTORICAL_INFO_TAG:
                        probabilityInfo = withHistoricalInfo(probabilityInfo, params[1], line);
                        break;
                    case CombinedFormatUtils.NOT_A_WORD_TAG:
                        isNotAWord = CombinedFormatUtils.isLiteralTrue(params[1]);
                        break;
                    case CombinedFormatUtils.POSSIBLY_OFFENSIVE_TAG:
                        isPossiblyOffensive = CombinedFormatUtils.isLiteralTrue(params[1]);
                        break;
                    }
                }
            } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
                bigrams.add(parseBigramLine(args, line));
            }
            // Any other line (including blank lines) is ignored.
        }
        // Commit the last word of the file, which has no following word line to trigger it.
        addWordWithBigrams(dict, word, probabilityInfo, isNotAWord, isPossiblyOffensive, bigrams);
        return dict;
    }

    /**
     * Writes a dictionary to a combined file.
     *
     * The attribute header is written first, followed by every word property of the dictionary
     * in the order defined by the natural ordering of {@link WordProperty}.
     *
     * @param destination a destination writer.
     * @param dict the dictionary to write.
     * @throws IOException if writing to the destination fails.
     */
    public static void writeDictionaryCombined(final BufferedWriter destination,
            final FusionDictionary dict) throws IOException {
        final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
        for (final WordProperty wordProperty : dict) {
            // This for ordering by frequency, then by asciibetic order
            wordPropertiesInDict.add(wordProperty);
        }
        destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
        for (final WordProperty wordProperty : wordPropertiesInDict) {
            destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
        }
    }

    /**
     * Returns the first line that does not start with the comment marker, or null if the end
     * of the stream is reached first.
     */
    private static String readFirstNonCommentLine(final BufferedReader reader)
            throws IOException {
        String line = reader.readLine();
        while (null != line && line.startsWith(COMMENT_LINE_STARTER)) {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Adds the word and its pending bigrams to the dictionary; does nothing if word is null
     * (i.e. no word line has been read yet).
     */
    private static void addWordWithBigrams(final FusionDictionary dict, final String word,
            final ProbabilityInfo probabilityInfo, final boolean isNotAWord,
            final boolean isPossiblyOffensive, final ArrayList<WeightedString> bigrams) {
        if (null == word) return;
        dict.add(word, probabilityInfo, isNotAWord, isPossiblyOffensive);
        for (WeightedString s : bigrams) {
            dict.setBigram(word, s.mWord, s.mProbabilityInfo);
        }
    }

    /**
     * Returns a copy of base whose probability is parsed from value, keeping the other fields.
     */
    private static ProbabilityInfo withProbability(final ProbabilityInfo base,
            final String value) {
        return new ProbabilityInfo(Integer.parseInt(value),
                base.mTimestamp, base.mLevel, base.mCount);
    }

    /**
     * Returns a copy of base whose timestamp, level and count are parsed from value, keeping
     * the probability. Throws RuntimeException if value does not have exactly three elements.
     */
    private static ProbabilityInfo withHistoricalInfo(final ProbabilityInfo base,
            final String value, final String line) {
        final String[] historicalInfoParams =
                value.split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
        if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
            throw new RuntimeException("Wrong format (historical info) : " + line);
        }
        return new ProbabilityInfo(base.mProbability,
                Integer.parseInt(historicalInfoParams[0]),
                Integer.parseInt(historicalInfoParams[1]),
                Integer.parseInt(historicalInfoParams[2]));
    }

    /**
     * Parses the comma-separated fields of one bigram line into a WeightedString. Throws
     * RuntimeException if a field has no '=' or if the bigram tag itself is absent.
     */
    private static WeightedString parseBigramLine(final String[] args, final String line) {
        String secondWordOfBigram = null;
        ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
        for (String param : args) {
            final String params[] = param.split("=", 2);
            if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
            if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
                secondWordOfBigram = params[1];
            } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
                bigramProbabilityInfo = withProbability(bigramProbabilityInfo, params[1]);
            } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
                bigramProbabilityInfo =
                        withHistoricalInfo(bigramProbabilityInfo, params[1], line);
            }
        }
        if (null == secondWordOfBigram) {
            throw new RuntimeException("Wrong format : " + line);
        }
        return new WeightedString(secondWordOfBigram, bigramProbabilityInfo);
    }
}