blob: 56ede7e175d7a23769d11afd652432ff76f813d6 [file] [log] [blame]
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.tools.lint.checks;
import static com.android.SdkConstants.DOT_XML;
import static com.android.tools.lint.detector.api.LintUtils.assertionsEnabled;
import com.android.annotations.NonNull;
import com.android.annotations.Nullable;
import com.android.annotations.VisibleForTesting;
import com.android.tools.lint.client.api.LintClient;
import com.android.tools.lint.detector.api.LintUtils;
import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.io.Files;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.WeakHashMap;
/**
* Database of common typos / misspellings.
*/
public class TypoLookup {
private static final TypoLookup NONE = new TypoLookup();
/** String separating misspellings and suggested replacements in the text file */
private static final String WORD_SEPARATOR = "->"; //$NON-NLS-1$
/** Relative path to the typos database file within the Lint installation */
private static final String XML_FILE_PATH = "tools/support/typos-%1$s.txt"; //$NON-NLS-1$
private static final String FILE_HEADER = "Typo database used by Android lint\000";
private static final int BINARY_FORMAT_VERSION = 2;
private static final boolean DEBUG_FORCE_REGENERATE_BINARY = false;
private static final boolean DEBUG_SEARCH = false;
private static final boolean WRITE_STATS = false;
/** Default size to reserve for each API entry when creating byte buffer to build up data */
private static final int BYTES_PER_ENTRY = 28;
private byte[] mData;
private int[] mIndices;
private int mWordCount;
private static final WeakHashMap<String, TypoLookup> sInstanceMap =
new WeakHashMap<String, TypoLookup>();
/**
* Returns an instance of the Typo database for the given locale
*
* @param client the client to associate with this database - used only for
* logging. The database object may be shared among repeated
* invocations, and in that case client used will be the one
* originally passed in. In other words, this parameter may be
* ignored if the client created is not new.
* @param locale the locale to look up a typo database for (should be a
* language code (ISO 639-1, two lowercase character names)
* @param region the region to look up a typo database for (should be a two
* letter ISO 3166-1 alpha-2 country code in upper case) language
* code
* @return a (possibly shared) instance of the typo database, or null if its
* data can't be found
*/
@Nullable
public static TypoLookup get(@NonNull LintClient client, @NonNull String locale,
@Nullable String region) {
synchronized (TypoLookup.class) {
String key = locale;
if (region != null && region.length() == 2) { // skip BCP-47 regions
// Allow for region-specific dictionaries. See for example
// http://en.wikipedia.org/wiki/American_and_British_English_spelling_differences
assert region.length() == 2
&& Character.isUpperCase(region.charAt(0))
&& Character.isUpperCase(region.charAt(1)) : region;
// Look for typos-en-rUS.txt etc
key = locale + 'r' + region;
}
TypoLookup db = sInstanceMap.get(key);
if (db == null) {
String path = String.format(XML_FILE_PATH, key);
File file = client.findResource(path);
if (file == null) {
// AOSP build environment?
String build = System.getenv("ANDROID_BUILD_TOP"); //$NON-NLS-1$
if (build != null) {
file = new File(build, ("sdk/files/" //$NON-NLS-1$
+ path.substring(path.lastIndexOf('/') + 1))
.replace('/', File.separatorChar));
}
}
if (file == null || !file.exists()) {
//noinspection VariableNotUsedInsideIf
if (region != null) {
// Fall back to the generic locale (non-region-specific) database
return get(client, locale, null);
}
db = NONE;
} else {
db = get(client, file);
assert db != null : file;
}
sInstanceMap.put(key, db);
}
if (db == NONE) {
return null;
} else {
return db;
}
}
}
/**
* Returns an instance of the typo database
*
* @param client the client to associate with this database - used only for
* logging
* @param xmlFile the XML file containing configuration data to use for this
* database
* @return a (possibly shared) instance of the typo database, or null
* if its data can't be found
*/
@Nullable
private static TypoLookup get(LintClient client, File xmlFile) {
if (!xmlFile.exists()) {
client.log(null, "The typo database file %1$s does not exist", xmlFile);
return null;
}
String name = xmlFile.getName();
if (LintUtils.endsWith(name, DOT_XML)) {
name = name.substring(0, name.length() - DOT_XML.length());
}
File cacheDir = client.getCacheDir(true/*create*/);
if (cacheDir == null) {
cacheDir = xmlFile.getParentFile();
}
File binaryData = new File(cacheDir, name
// Incorporate version number in the filename to avoid upgrade filename
// conflicts on Windows (such as issue #26663)
+ '-' + BINARY_FORMAT_VERSION + ".bin"); //$NON-NLS-1$
if (DEBUG_FORCE_REGENERATE_BINARY) {
System.err.println("\nTemporarily regenerating binary data unconditionally \nfrom "
+ xmlFile + "\nto " + binaryData);
if (!createCache(client, xmlFile, binaryData)) {
return null;
}
} else if (!binaryData.exists() || binaryData.lastModified() < xmlFile.lastModified()) {
if (!createCache(client, xmlFile, binaryData)) {
return null;
}
}
if (!binaryData.exists()) {
client.log(null, "The typo database file %1$s does not exist", binaryData);
return null;
}
return new TypoLookup(client, xmlFile, binaryData);
}
private static boolean createCache(LintClient client, File xmlFile, File binaryData) {
long begin = 0;
if (WRITE_STATS) {
begin = System.currentTimeMillis();
}
// Read in data
List<String> lines;
try {
lines = Files.readLines(xmlFile, Charsets.UTF_8);
} catch (IOException e) {
client.log(e, "Can't read typo database file");
return false;
}
if (WRITE_STATS) {
long end = System.currentTimeMillis();
System.out.println("Reading data structures took " + (end - begin) + " ms)");
}
try {
writeDatabase(binaryData, lines);
return true;
} catch (IOException ioe) {
client.log(ioe, "Can't write typo cache file");
}
return false;
}
/** Use one of the {@link #get} factory methods instead */
private TypoLookup(
@NonNull LintClient client,
@NonNull File xmlFile,
@Nullable File binaryFile) {
if (binaryFile != null) {
readData(client, xmlFile, binaryFile);
}
}
private TypoLookup() {
}
private void readData(@NonNull LintClient client, @NonNull File xmlFile,
@NonNull File binaryFile) {
if (!binaryFile.exists()) {
client.log(null, "%1$s does not exist", binaryFile);
return;
}
long start = System.currentTimeMillis();
try {
MappedByteBuffer buffer = Files.map(binaryFile, MapMode.READ_ONLY);
assert buffer.order() == ByteOrder.BIG_ENDIAN;
// First skip the header
byte[] expectedHeader = FILE_HEADER.getBytes(Charsets.US_ASCII);
buffer.rewind();
for (int offset = 0; offset < expectedHeader.length; offset++) {
if (expectedHeader[offset] != buffer.get()) {
client.log(null, "Incorrect file header: not an typo database cache " +
"file, or a corrupt cache file");
return;
}
}
// Read in the format number
if (buffer.get() != BINARY_FORMAT_VERSION) {
// Force regeneration of new binary data with up to date format
if (createCache(client, xmlFile, binaryFile)) {
readData(client, xmlFile, binaryFile); // Recurse
}
return;
}
mWordCount = buffer.getInt();
// Read in the word table indices;
int count = mWordCount;
int[] offsets = new int[count];
// Another idea: I can just store the DELTAS in the file (and add them up
// when reading back in) such that it takes just ONE byte instead of four!
for (int i = 0; i < count; i++) {
offsets[i] = buffer.getInt();
}
// No need to read in the rest -- we'll just keep the whole byte array in memory
// TODO: Make this code smarter/more efficient.
int size = buffer.limit();
byte[] b = new byte[size];
buffer.rewind();
buffer.get(b);
mData = b;
mIndices = offsets;
// TODO: We only need to keep the data portion here since we've initialized
// the offset array separately.
// TODO: Investigate (profile) accessing the byte buffer directly instead of
// accessing a byte array.
} catch (IOException e) {
client.log(e, null);
}
if (WRITE_STATS) {
long end = System.currentTimeMillis();
System.out.println("\nRead typo database in " + (end - start)
+ " milliseconds.");
System.out.println("Size of data table: " + mData.length + " bytes ("
+ Integer.toString(mData.length/1024) + "k)\n");
}
}
/** See the {@link #readData(LintClient,File,File)} for documentation on the data format. */
private static void writeDatabase(File file, List<String> lines) throws IOException {
/*
* 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded
* as ASCII characters. The purpose of the header is to identify what the file
* is for, for anyone attempting to open the file.
* 2. A file version number. If the binary file does not match the reader's expected
* version, it can ignore it (and regenerate the cache from XML).
*/
// Drop comments etc
List<String> words = new ArrayList<String>(lines.size());
for (String line : lines) {
if (!line.isEmpty() && Character.isLetter(line.charAt(0))) {
int end = line.indexOf(WORD_SEPARATOR);
if (end == -1) {
end = line.trim().length();
}
String typo = line.substring(0, end).trim();
String replacements = line.substring(end + WORD_SEPARATOR.length()).trim();
if (replacements.isEmpty()) {
// We don't support empty replacements
continue;
}
String combined = typo + (char) 0 + replacements;
words.add(combined);
}
}
byte[][] wordArrays = new byte[words.size()][];
for (int i = 0, n = words.size(); i < n; i++) {
String word = words.get(i);
wordArrays[i] = word.getBytes(Charsets.UTF_8);
}
// Sort words, using our own comparator to ensure that it matches the
// binary search in getTypos()
Comparator<byte[]> comparator = new Comparator<byte[]>() {
@Override
public int compare(byte[] o1, byte[] o2) {
return TypoLookup.compare(o1, 0, (byte) 0, o2, 0, o2.length);
}
};
Arrays.sort(wordArrays, comparator);
byte[] headerBytes = FILE_HEADER.getBytes(Charsets.US_ASCII);
int entryCount = wordArrays.length;
int capacity = entryCount * BYTES_PER_ENTRY + headerBytes.length + 5;
ByteBuffer buffer = ByteBuffer.allocate(capacity);
buffer.order(ByteOrder.BIG_ENDIAN);
// 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded
// as ASCII characters. The purpose of the header is to identify what the file
// is for, for anyone attempting to open the file.
buffer.put(headerBytes);
// 2. A file version number. If the binary file does not match the reader's expected
// version, it can ignore it (and regenerate the cache from XML).
buffer.put((byte) BINARY_FORMAT_VERSION);
// 3. The number of words [1 int]
buffer.putInt(entryCount);
// 4. Word offset table (one integer per word, pointing to the byte offset in the
// file (relative to the beginning of the file) where each word begins.
// The words are always sorted alphabetically.
int wordOffsetTable = buffer.position();
// Reserve enough room for the offset table here: we will backfill it with pointers
// as we're writing out the data structures below
for (int i = 0, n = entryCount; i < n; i++) {
buffer.putInt(0);
}
int nextEntry = buffer.position();
int nextOffset = wordOffsetTable;
// 7. Word entry table. Each word entry consists of the word, followed by the byte 0
// as a terminator, followed by a comma separated list of suggestions (which
// may be empty), or a final 0.
for (int i = 0; i < entryCount; i++) {
byte[] word = wordArrays[i];
buffer.position(nextOffset);
buffer.putInt(nextEntry);
nextOffset = buffer.position();
buffer.position(nextEntry);
buffer.put(word); // already embeds 0 to separate typo from words
buffer.put((byte) 0);
nextEntry = buffer.position();
}
int size = buffer.position();
assert size <= buffer.limit();
buffer.mark();
if (WRITE_STATS) {
System.out.println("Wrote " + words.size() + " word entries");
System.out.print("Actual binary size: " + size + " bytes");
System.out.println(String.format(" (%.1fM)", size/(1024*1024.f)));
System.out.println("Allocated size: " + (entryCount * BYTES_PER_ENTRY) + " bytes");
System.out.println("Required bytes per entry: " + (size/ entryCount) + " bytes");
}
// Now dump this out as a file
// There's probably an API to do this more efficiently; TODO: Look into this.
byte[] b = new byte[size];
buffer.rewind();
buffer.get(b);
FileOutputStream output = Files.newOutputStreamSupplier(file).getOutput();
output.write(b);
output.close();
}
// For debugging only
private String dumpEntry(int offset) {
if (DEBUG_SEARCH) {
int end = offset;
while (mData[end] != 0) {
end++;
}
return new String(mData, offset, end - offset, Charsets.UTF_8);
} else {
return "<disabled>"; //$NON-NLS-1$
}
}
/** Comparison function: *only* used for ASCII strings */
@VisibleForTesting
static int compare(byte[] data, int offset, byte terminator, CharSequence s,
int begin, int end) {
int i = offset;
int j = begin;
for (; ; i++, j++) {
byte b = data[i];
if (b == ' ') {
// We've matched up to the space in a split-word typo, such as
// in German all zu=>allzu; here we've matched just past "all".
// Rather than terminating, attempt to continue in the buffer.
if (j == end) {
int max = s.length();
if (end < max && s.charAt(end) == ' ') {
// Find next word
for (; end < max; end++) {
char c = s.charAt(end);
if (!Character.isLetter(c)) {
if (c == ' ' && end == j) {
continue;
}
break;
}
}
}
}
}
if (j == end) {
break;
}
if (b == '*') {
// Glob match (only supported at the end)
return 0;
}
char c = s.charAt(j);
byte cb = (byte) c;
int delta = b - cb;
if (delta != 0) {
cb = (byte) Character.toLowerCase(c);
if (b != cb) {
// Ensure that it has the right sign
b = (byte) Character.toLowerCase(b);
delta = b - cb;
if (delta != 0) {
return delta;
}
}
}
}
return data[i] - terminator;
}
/** Comparison function used for general UTF-8 encoded strings */
@VisibleForTesting
static int compare(byte[] data, int offset, byte terminator, byte[] s,
int begin, int end) {
int i = offset;
int j = begin;
for (; ; i++, j++) {
byte b = data[i];
if (b == ' ') {
// We've matched up to the space in a split-word typo, such as
// in German all zu=>allzu; here we've matched just past "all".
// Rather than terminating, attempt to continue in the buffer.
// We've matched up to the space in a split-word typo, such as
// in German all zu=>allzu; here we've matched just past "all".
// Rather than terminating, attempt to continue in the buffer.
if (j == end) {
int max = s.length;
if (end < max && s[end] == ' ') {
// Find next word
for (; end < max; end++) {
byte cb = s[end];
if (!isLetter(cb)) {
if (cb == ' ' && end == j) {
continue;
}
break;
}
}
}
}
}
if (j == end) {
break;
}
if (b == '*') {
// Glob match (only supported at the end)
return 0;
}
byte cb = s[j];
int delta = b - cb;
if (delta != 0) {
cb = toLowerCase(cb);
b = toLowerCase(b);
delta = b - cb;
if (delta != 0) {
return delta;
}
}
if (b == terminator || cb == terminator) {
return delta;
}
}
return data[i] - terminator;
}
/**
* Look up whether this word is a typo, and if so, return the typo itself
* and one or more likely meanings
*
* @param text the string containing the word
* @param begin the index of the first character in the word
* @param end the index of the first character after the word. Note that the
* search may extend <b>beyond</b> this index, if for example the
* word matches a multi-word typo in the dictionary
* @return a list of the typo itself followed by the replacement strings if
* the word represents a typo, and null otherwise
*/
@Nullable
public List<String> getTypos(@NonNull CharSequence text, int begin, int end) {
assert end <= text.length();
if (assertionsEnabled()) {
for (int i = begin; i < end; i++) {
char c = text.charAt(i);
if (c >= 128) {
assert false : "Call the UTF-8 version of this method instead";
return null;
}
}
}
int low = 0;
int high = mWordCount - 1;
while (low <= high) {
int middle = (low + high) >>> 1;
int offset = mIndices[middle];
if (DEBUG_SEARCH) {
System.out.println("Comparing string " + text +" with entry at " + offset
+ ": " + dumpEntry(offset));
}
// Compare the word at the given index.
int compare = compare(mData, offset, (byte) 0, text, begin, end);
if (compare == 0) {
offset = mIndices[middle];
// Don't allow matching uncapitalized words, such as "enlish", when
// the dictionary word is capitalized, "Enlish".
if (mData[offset] != text.charAt(begin)
&& Character.isLowerCase(text.charAt(begin))) {
return null;
}
// Make sure there is a case match; we only want to allow
// matching capitalized words to capitalized typos or uncapitalized typos
// (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized
// typos (e.g. "enlish" to "Enlish").
String glob = null;
for (int i = begin; ; i++) {
byte b = mData[offset++];
if (b == 0) {
offset--;
break;
} else if (b == '*') {
int globEnd = i;
while (globEnd < text.length()
&& Character.isLetter(text.charAt(globEnd))) {
globEnd++;
}
glob = text.subSequence(i, globEnd).toString();
break;
}
char c = text.charAt(i);
byte cb = (byte) c;
if (b != cb && i > begin) {
return null;
}
}
return computeSuggestions(mIndices[middle], offset, glob);
}
if (compare < 0) {
low = middle + 1;
} else if (compare > 0) {
high = middle - 1;
} else {
assert false; // compare == 0 already handled above
return null;
}
}
return null;
}
/**
* Look up whether this word is a typo, and if so, return the typo itself
* and one or more likely meanings
*
* @param utf8Text the string containing the word, encoded as UTF-8
* @param begin the index of the first character in the word
* @param end the index of the first character after the word. Note that the
* search may extend <b>beyond</b> this index, if for example the
* word matches a multi-word typo in the dictionary
* @return a list of the typo itself followed by the replacement strings if
* the word represents a typo, and null otherwise
*/
@Nullable
public List<String> getTypos(@NonNull byte[] utf8Text, int begin, int end) {
assert end <= utf8Text.length;
int low = 0;
int high = mWordCount - 1;
while (low <= high) {
int middle = (low + high) >>> 1;
int offset = mIndices[middle];
if (DEBUG_SEARCH) {
String s = new String(Arrays.copyOfRange(utf8Text, begin, end), Charsets.UTF_8);
System.out.println("Comparing string " + s +" with entry at " + offset
+ ": " + dumpEntry(offset));
System.out.println(" middle=" + middle + ", low=" + low + ", high=" + high);
}
// Compare the word at the given index.
int compare = compare(mData, offset, (byte) 0, utf8Text, begin, end);
if (DEBUG_SEARCH) {
System.out.println(" signum=" + (int)Math.signum(compare) + ", delta=" + compare);
}
if (compare == 0) {
offset = mIndices[middle];
// Don't allow matching uncapitalized words, such as "enlish", when
// the dictionary word is capitalized, "Enlish".
if (mData[offset] != utf8Text[begin] && isUpperCase(mData[offset])) {
return null;
}
// Make sure there is a case match; we only want to allow
// matching capitalized words to capitalized typos or uncapitalized typos
// (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized
// typos (e.g. "enlish" to "Enlish").
String glob = null;
for (int i = begin; ; i++) {
byte b = mData[offset++];
if (b == 0) {
offset--;
break;
} else if (b == '*') {
int globEnd = i;
while (globEnd < utf8Text.length && isLetter(utf8Text[globEnd])) {
globEnd++;
}
glob = new String(utf8Text, i, globEnd - i, Charsets.UTF_8);
break;
}
byte cb = utf8Text[i];
if (b != cb && i > begin) {
return null;
}
}
return computeSuggestions(mIndices[middle], offset, glob);
}
if (compare < 0) {
low = middle + 1;
} else if (compare > 0) {
high = middle - 1;
} else {
assert false; // compare == 0 already handled above
return null;
}
}
return null;
}
private List<String> computeSuggestions(int begin, int offset, String glob) {
String typo = new String(mData, begin, offset - begin, Charsets.UTF_8);
if (glob != null) {
typo = typo.replaceAll("\\*", glob); //$NON-NLS-1$
}
assert mData[offset] == 0;
offset++;
int replacementEnd = offset;
while (mData[replacementEnd] != 0) {
replacementEnd++;
}
String replacements = new String(mData, offset, replacementEnd - offset, Charsets.UTF_8);
List<String> words = new ArrayList<String>();
words.add(typo);
// The first entry should be the typo itself. We need to pass this back since due
// to multi-match words and globbing it could extend beyond the initial word range
for (String s : Splitter.on(',').omitEmptyStrings().trimResults().split(replacements)) {
if (glob != null) {
// Need to append the glob string to each result
words.add(s.replaceAll("\\*", glob)); //$NON-NLS-1$
} else {
words.add(s);
}
}
return words;
}
// "Character" handling for bytes. This assumes that the bytes correspond to Unicode
// characters in the ISO 8859-1 range, which is are encoded the same way in UTF-8.
// This obviously won't work to for example uppercase to lowercase conversions for
// multi byte characters, which means we simply won't catch typos if the dictionaries
// contain these. None of the currently included dictionaries do. However, it does
// help us properly deal with punctuation and spacing characters.
static boolean isUpperCase(byte b) {
return Character.isUpperCase((char) b);
}
static byte toLowerCase(byte b) {
return (byte) Character.toLowerCase((char) b);
}
static boolean isSpace(byte b) {
return Character.isWhitespace((char) b);
}
static boolean isLetter(byte b) {
// Assume that multi byte characters represent letters in other languages.
// Obviously, it could be unusual punctuation etc but letters are more likely
// in this context.
return Character.isLetter((char) b) || (b & 0x80) != 0;
}
}