| /* |
| * Copyright (C) 2012 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.android.tools.lint.checks; |
| |
| import static com.android.SdkConstants.DOT_XML; |
| import static com.android.tools.lint.detector.api.LintUtils.assertionsEnabled; |
| |
| import com.android.annotations.NonNull; |
| import com.android.annotations.Nullable; |
| import com.android.annotations.VisibleForTesting; |
| import com.android.tools.lint.client.api.LintClient; |
| import com.android.tools.lint.detector.api.LintUtils; |
| import com.google.common.base.Charsets; |
| import com.google.common.base.Splitter; |
| import com.google.common.io.Files; |
| |
| import java.io.File; |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.nio.ByteOrder; |
| import java.nio.MappedByteBuffer; |
| import java.nio.channels.FileChannel.MapMode; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Comparator; |
| import java.util.List; |
| import java.util.WeakHashMap; |
| |
| /** |
| * Database of common typos / misspellings. |
| */ |
| public class TypoLookup { |
| /** Sentinel stored in {@link #sInstanceMap} for keys that have no typo database; the public get() method turns it back into null for callers. */ private static final TypoLookup NONE = new TypoLookup(); |
| |
| /** String separating misspellings and suggested replacements in the text file */ |
| private static final String WORD_SEPARATOR = "->"; //$NON-NLS-1$ |
| |
| /** Relative path to the typos database file within the Lint installation; %1$s is replaced by the lookup key (locale, optionally followed by 'r' and the region). Despite the constant's name the path ends in .txt. */ |
| private static final String XML_FILE_PATH = "tools/support/typos-%1$s.txt"; //$NON-NLS-1$ |
| /** Magic text written at the very start of the binary cache file and verified when it is read back. */ private static final String FILE_HEADER = "Typo database used by Android lint\000"; |
| /** Version of the binary cache layout; stored as a single byte after the header and also embedded in the cache file name. */ private static final int BINARY_FORMAT_VERSION = 2; |
| /** Debugging aid: when true the binary cache is rebuilt every time a database is loaded. */ private static final boolean DEBUG_FORCE_REGENERATE_BINARY = false; |
| /** Debugging aid: when true the binary searches print each comparison to System.out and dumpEntry() returns real entry text. */ private static final boolean DEBUG_SEARCH = false; |
| /** Debugging aid: when true timing and size statistics are printed to System.out. */ private static final boolean WRITE_STATS = false; |
| /** Default size to reserve for each typo entry when creating byte buffer to build up data */ |
| private static final int BYTES_PER_ENTRY = 28; |
| |
| /** Full contents of the binary cache file (header, offset table and entries); stays null if loading failed. */ private byte[] mData; |
| /** Byte offset into {@link #mData} of each entry, in the sorted order used by the binary search. */ private int[] mIndices; |
| /** Number of entries in the database, as read from the binary cache file. */ private int mWordCount; |
| |
| /** Cache of loaded databases keyed by locale or locale + 'r' + region; only accessed while holding the TypoLookup.class lock. NOTE(review): keys are Strings held weakly, so entries may be dropped whenever the key string is collected - confirm this is intended. */ private static final WeakHashMap<String, TypoLookup> sInstanceMap = |
| new WeakHashMap<String, TypoLookup>(); |
| |
| /** |
| * Returns an instance of the Typo database for the given locale |
| * |
| * @param client the client to associate with this database - used only for |
| * logging. The database object may be shared among repeated |
| * invocations, and in that case client used will be the one |
| * originally passed in. In other words, this parameter may be |
| * ignored if the client created is not new. |
| * @param locale the locale to look up a typo database for (should be a |
| * language code (ISO 639-1, two lowercase character names) |
| * @param region the region to look up a typo database for (should be a two |
| * letter ISO 3166-1 alpha-2 country code in upper case) language |
| * code |
| * @return a (possibly shared) instance of the typo database, or null if its |
| * data can't be found |
| */ |
| @Nullable |
| public static TypoLookup get(@NonNull LintClient client, @NonNull String locale, |
| @Nullable String region) { |
| synchronized (TypoLookup.class) { |
| String key = locale; |
| |
| if (region != null && region.length() == 2) { // skip BCP-47 regions |
| // Allow for region-specific dictionaries. See for example |
| // http://en.wikipedia.org/wiki/American_and_British_English_spelling_differences |
| assert region.length() == 2 |
| && Character.isUpperCase(region.charAt(0)) |
| && Character.isUpperCase(region.charAt(1)) : region; |
| // Look for typos-en-rUS.txt etc |
| key = locale + 'r' + region; |
| } |
| |
| TypoLookup db = sInstanceMap.get(key); |
| if (db == null) { |
| String path = String.format(XML_FILE_PATH, key); |
| File file = client.findResource(path); |
| if (file == null) { |
| // AOSP build environment? |
| String build = System.getenv("ANDROID_BUILD_TOP"); //$NON-NLS-1$ |
| if (build != null) { |
| file = new File(build, ("sdk/files/" //$NON-NLS-1$ |
| + path.substring(path.lastIndexOf('/') + 1)) |
| .replace('/', File.separatorChar)); |
| } |
| } |
| |
| if (file == null || !file.exists()) { |
| //noinspection VariableNotUsedInsideIf |
| if (region != null) { |
| // Fall back to the generic locale (non-region-specific) database |
| return get(client, locale, null); |
| } |
| db = NONE; |
| } else { |
| db = get(client, file); |
| assert db != null : file; |
| } |
| sInstanceMap.put(key, db); |
| } |
| |
| if (db == NONE) { |
| return null; |
| } else { |
| return db; |
| } |
| } |
| } |
| |
| /** |
| * Returns an instance of the typo database |
| * |
| * @param client the client to associate with this database - used only for |
| * logging |
| * @param xmlFile the XML file containing configuration data to use for this |
| * database |
| * @return a (possibly shared) instance of the typo database, or null |
| * if its data can't be found |
| */ |
| @Nullable |
| private static TypoLookup get(LintClient client, File xmlFile) { |
| if (!xmlFile.exists()) { |
| client.log(null, "The typo database file %1$s does not exist", xmlFile); |
| return null; |
| } |
| |
| String name = xmlFile.getName(); |
| if (LintUtils.endsWith(name, DOT_XML)) { |
| name = name.substring(0, name.length() - DOT_XML.length()); |
| } |
| File cacheDir = client.getCacheDir(true/*create*/); |
| if (cacheDir == null) { |
| cacheDir = xmlFile.getParentFile(); |
| } |
| |
| File binaryData = new File(cacheDir, name |
| // Incorporate version number in the filename to avoid upgrade filename |
| // conflicts on Windows (such as issue #26663) |
| + '-' + BINARY_FORMAT_VERSION + ".bin"); //$NON-NLS-1$ |
| |
| if (DEBUG_FORCE_REGENERATE_BINARY) { |
| System.err.println("\nTemporarily regenerating binary data unconditionally \nfrom " |
| + xmlFile + "\nto " + binaryData); |
| if (!createCache(client, xmlFile, binaryData)) { |
| return null; |
| } |
| } else if (!binaryData.exists() || binaryData.lastModified() < xmlFile.lastModified()) { |
| if (!createCache(client, xmlFile, binaryData)) { |
| return null; |
| } |
| } |
| |
| if (!binaryData.exists()) { |
| client.log(null, "The typo database file %1$s does not exist", binaryData); |
| return null; |
| } |
| |
| return new TypoLookup(client, xmlFile, binaryData); |
| } |
| |
| private static boolean createCache(LintClient client, File xmlFile, File binaryData) { |
| long begin = 0; |
| if (WRITE_STATS) { |
| begin = System.currentTimeMillis(); |
| } |
| |
| // Read in data |
| List<String> lines; |
| try { |
| lines = Files.readLines(xmlFile, Charsets.UTF_8); |
| } catch (IOException e) { |
| client.log(e, "Can't read typo database file"); |
| return false; |
| } |
| |
| if (WRITE_STATS) { |
| long end = System.currentTimeMillis(); |
| System.out.println("Reading data structures took " + (end - begin) + " ms)"); |
| } |
| |
| try { |
| writeDatabase(binaryData, lines); |
| return true; |
| } catch (IOException ioe) { |
| client.log(ioe, "Can't write typo cache file"); |
| } |
| |
| return false; |
| } |
| |
/**
 * Creates a database backed by the given binary cache file. Use one of the
 * {@link #get} factory methods instead.
 *
 * @param client the client used for logging while loading
 * @param xmlFile the dictionary file the cache was generated from
 * @param binaryFile the binary cache file to load, or null to create an empty database
 */
private TypoLookup(
        @NonNull LintClient client,
        @NonNull File xmlFile,
        @Nullable File binaryFile) {
    if (binaryFile == null) {
        return;
    }
    readData(client, xmlFile, binaryFile);
}

/** Creates an empty database without any entries. */
private TypoLookup() {
}
| |
| private void readData(@NonNull LintClient client, @NonNull File xmlFile, |
| @NonNull File binaryFile) { |
| if (!binaryFile.exists()) { |
| client.log(null, "%1$s does not exist", binaryFile); |
| return; |
| } |
| long start = System.currentTimeMillis(); |
| try { |
| MappedByteBuffer buffer = Files.map(binaryFile, MapMode.READ_ONLY); |
| assert buffer.order() == ByteOrder.BIG_ENDIAN; |
| |
| // First skip the header |
| byte[] expectedHeader = FILE_HEADER.getBytes(Charsets.US_ASCII); |
| buffer.rewind(); |
| for (int offset = 0; offset < expectedHeader.length; offset++) { |
| if (expectedHeader[offset] != buffer.get()) { |
| client.log(null, "Incorrect file header: not an typo database cache " + |
| "file, or a corrupt cache file"); |
| return; |
| } |
| } |
| |
| // Read in the format number |
| if (buffer.get() != BINARY_FORMAT_VERSION) { |
| // Force regeneration of new binary data with up to date format |
| if (createCache(client, xmlFile, binaryFile)) { |
| readData(client, xmlFile, binaryFile); // Recurse |
| } |
| |
| return; |
| } |
| |
| mWordCount = buffer.getInt(); |
| |
| // Read in the word table indices; |
| int count = mWordCount; |
| int[] offsets = new int[count]; |
| |
| // Another idea: I can just store the DELTAS in the file (and add them up |
| // when reading back in) such that it takes just ONE byte instead of four! |
| |
| for (int i = 0; i < count; i++) { |
| offsets[i] = buffer.getInt(); |
| } |
| |
| // No need to read in the rest -- we'll just keep the whole byte array in memory |
| // TODO: Make this code smarter/more efficient. |
| int size = buffer.limit(); |
| byte[] b = new byte[size]; |
| buffer.rewind(); |
| buffer.get(b); |
| mData = b; |
| mIndices = offsets; |
| |
| // TODO: We only need to keep the data portion here since we've initialized |
| // the offset array separately. |
| // TODO: Investigate (profile) accessing the byte buffer directly instead of |
| // accessing a byte array. |
| } catch (IOException e) { |
| client.log(e, null); |
| } |
| if (WRITE_STATS) { |
| long end = System.currentTimeMillis(); |
| System.out.println("\nRead typo database in " + (end - start) |
| + " milliseconds."); |
| System.out.println("Size of data table: " + mData.length + " bytes (" |
| + Integer.toString(mData.length/1024) + "k)\n"); |
| } |
| } |
| |
| /** See the {@link #readData(LintClient,File,File)} for documentation on the data format. */ |
| private static void writeDatabase(File file, List<String> lines) throws IOException { |
| /* |
| * 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded |
| * as ASCII characters. The purpose of the header is to identify what the file |
| * is for, for anyone attempting to open the file. |
| * 2. A file version number. If the binary file does not match the reader's expected |
| * version, it can ignore it (and regenerate the cache from XML). |
| */ |
| |
| // Drop comments etc |
| List<String> words = new ArrayList<String>(lines.size()); |
| for (String line : lines) { |
| if (!line.isEmpty() && Character.isLetter(line.charAt(0))) { |
| int end = line.indexOf(WORD_SEPARATOR); |
| if (end == -1) { |
| end = line.trim().length(); |
| } |
| String typo = line.substring(0, end).trim(); |
| String replacements = line.substring(end + WORD_SEPARATOR.length()).trim(); |
| if (replacements.isEmpty()) { |
| // We don't support empty replacements |
| continue; |
| } |
| String combined = typo + (char) 0 + replacements; |
| |
| words.add(combined); |
| } |
| } |
| |
| byte[][] wordArrays = new byte[words.size()][]; |
| for (int i = 0, n = words.size(); i < n; i++) { |
| String word = words.get(i); |
| wordArrays[i] = word.getBytes(Charsets.UTF_8); |
| } |
| // Sort words, using our own comparator to ensure that it matches the |
| // binary search in getTypos() |
| Comparator<byte[]> comparator = new Comparator<byte[]>() { |
| @Override |
| public int compare(byte[] o1, byte[] o2) { |
| return TypoLookup.compare(o1, 0, (byte) 0, o2, 0, o2.length); |
| } |
| }; |
| Arrays.sort(wordArrays, comparator); |
| |
| byte[] headerBytes = FILE_HEADER.getBytes(Charsets.US_ASCII); |
| int entryCount = wordArrays.length; |
| int capacity = entryCount * BYTES_PER_ENTRY + headerBytes.length + 5; |
| ByteBuffer buffer = ByteBuffer.allocate(capacity); |
| buffer.order(ByteOrder.BIG_ENDIAN); |
| // 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded |
| // as ASCII characters. The purpose of the header is to identify what the file |
| // is for, for anyone attempting to open the file. |
| buffer.put(headerBytes); |
| |
| // 2. A file version number. If the binary file does not match the reader's expected |
| // version, it can ignore it (and regenerate the cache from XML). |
| buffer.put((byte) BINARY_FORMAT_VERSION); |
| |
| // 3. The number of words [1 int] |
| buffer.putInt(entryCount); |
| |
| // 4. Word offset table (one integer per word, pointing to the byte offset in the |
| // file (relative to the beginning of the file) where each word begins. |
| // The words are always sorted alphabetically. |
| int wordOffsetTable = buffer.position(); |
| |
| // Reserve enough room for the offset table here: we will backfill it with pointers |
| // as we're writing out the data structures below |
| for (int i = 0, n = entryCount; i < n; i++) { |
| buffer.putInt(0); |
| } |
| |
| int nextEntry = buffer.position(); |
| int nextOffset = wordOffsetTable; |
| |
| // 7. Word entry table. Each word entry consists of the word, followed by the byte 0 |
| // as a terminator, followed by a comma separated list of suggestions (which |
| // may be empty), or a final 0. |
| for (int i = 0; i < entryCount; i++) { |
| byte[] word = wordArrays[i]; |
| buffer.position(nextOffset); |
| buffer.putInt(nextEntry); |
| nextOffset = buffer.position(); |
| buffer.position(nextEntry); |
| |
| buffer.put(word); // already embeds 0 to separate typo from words |
| buffer.put((byte) 0); |
| |
| nextEntry = buffer.position(); |
| } |
| |
| int size = buffer.position(); |
| assert size <= buffer.limit(); |
| buffer.mark(); |
| |
| if (WRITE_STATS) { |
| System.out.println("Wrote " + words.size() + " word entries"); |
| System.out.print("Actual binary size: " + size + " bytes"); |
| System.out.println(String.format(" (%.1fM)", size/(1024*1024.f))); |
| |
| System.out.println("Allocated size: " + (entryCount * BYTES_PER_ENTRY) + " bytes"); |
| System.out.println("Required bytes per entry: " + (size/ entryCount) + " bytes"); |
| } |
| |
| // Now dump this out as a file |
| // There's probably an API to do this more efficiently; TODO: Look into this. |
| byte[] b = new byte[size]; |
| buffer.rewind(); |
| buffer.get(b); |
| FileOutputStream output = Files.newOutputStreamSupplier(file).getOutput(); |
| output.write(b); |
| output.close(); |
| } |
| |
| // For debugging only |
| private String dumpEntry(int offset) { |
| if (DEBUG_SEARCH) { |
| int end = offset; |
| while (mData[end] != 0) { |
| end++; |
| } |
| return new String(mData, offset, end - offset, Charsets.UTF_8); |
| } else { |
| return "<disabled>"; //$NON-NLS-1$ |
| } |
| } |
| |
| /** Comparison function: *only* used for ASCII strings */ |
| @VisibleForTesting |
| static int compare(byte[] data, int offset, byte terminator, CharSequence s, |
| int begin, int end) { |
| int i = offset; |
| int j = begin; |
| for (; ; i++, j++) { |
| byte b = data[i]; |
| if (b == ' ') { |
| // We've matched up to the space in a split-word typo, such as |
| // in German all zu=>allzu; here we've matched just past "all". |
| // Rather than terminating, attempt to continue in the buffer. |
| if (j == end) { |
| int max = s.length(); |
| if (end < max && s.charAt(end) == ' ') { |
| // Find next word |
| for (; end < max; end++) { |
| char c = s.charAt(end); |
| if (!Character.isLetter(c)) { |
| if (c == ' ' && end == j) { |
| continue; |
| } |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| if (j == end) { |
| break; |
| } |
| |
| if (b == '*') { |
| // Glob match (only supported at the end) |
| return 0; |
| } |
| char c = s.charAt(j); |
| byte cb = (byte) c; |
| int delta = b - cb; |
| if (delta != 0) { |
| cb = (byte) Character.toLowerCase(c); |
| if (b != cb) { |
| // Ensure that it has the right sign |
| b = (byte) Character.toLowerCase(b); |
| delta = b - cb; |
| if (delta != 0) { |
| return delta; |
| } |
| } |
| } |
| } |
| |
| return data[i] - terminator; |
| } |
| |
| /** Comparison function used for general UTF-8 encoded strings */ |
| @VisibleForTesting |
| static int compare(byte[] data, int offset, byte terminator, byte[] s, |
| int begin, int end) { |
| int i = offset; |
| int j = begin; |
| for (; ; i++, j++) { |
| byte b = data[i]; |
| if (b == ' ') { |
| // We've matched up to the space in a split-word typo, such as |
| // in German all zu=>allzu; here we've matched just past "all". |
| // Rather than terminating, attempt to continue in the buffer. |
| // We've matched up to the space in a split-word typo, such as |
| // in German all zu=>allzu; here we've matched just past "all". |
| // Rather than terminating, attempt to continue in the buffer. |
| if (j == end) { |
| int max = s.length; |
| if (end < max && s[end] == ' ') { |
| // Find next word |
| for (; end < max; end++) { |
| byte cb = s[end]; |
| if (!isLetter(cb)) { |
| if (cb == ' ' && end == j) { |
| continue; |
| } |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| if (j == end) { |
| break; |
| } |
| if (b == '*') { |
| // Glob match (only supported at the end) |
| return 0; |
| } |
| byte cb = s[j]; |
| int delta = b - cb; |
| if (delta != 0) { |
| cb = toLowerCase(cb); |
| b = toLowerCase(b); |
| delta = b - cb; |
| if (delta != 0) { |
| return delta; |
| } |
| } |
| |
| if (b == terminator || cb == terminator) { |
| return delta; |
| } |
| } |
| |
| return data[i] - terminator; |
| } |
| |
| /** |
| * Look up whether this word is a typo, and if so, return the typo itself |
| * and one or more likely meanings |
| * |
| * @param text the string containing the word |
| * @param begin the index of the first character in the word |
| * @param end the index of the first character after the word. Note that the |
| * search may extend <b>beyond</b> this index, if for example the |
| * word matches a multi-word typo in the dictionary |
| * @return a list of the typo itself followed by the replacement strings if |
| * the word represents a typo, and null otherwise |
| */ |
| @Nullable |
| public List<String> getTypos(@NonNull CharSequence text, int begin, int end) { |
| assert end <= text.length(); |
| |
| if (assertionsEnabled()) { |
| for (int i = begin; i < end; i++) { |
| char c = text.charAt(i); |
| if (c >= 128) { |
| assert false : "Call the UTF-8 version of this method instead"; |
| return null; |
| } |
| } |
| } |
| |
| int low = 0; |
| int high = mWordCount - 1; |
| while (low <= high) { |
| int middle = (low + high) >>> 1; |
| int offset = mIndices[middle]; |
| |
| if (DEBUG_SEARCH) { |
| System.out.println("Comparing string " + text +" with entry at " + offset |
| + ": " + dumpEntry(offset)); |
| } |
| |
| // Compare the word at the given index. |
| int compare = compare(mData, offset, (byte) 0, text, begin, end); |
| |
| if (compare == 0) { |
| offset = mIndices[middle]; |
| |
| // Don't allow matching uncapitalized words, such as "enlish", when |
| // the dictionary word is capitalized, "Enlish". |
| if (mData[offset] != text.charAt(begin) |
| && Character.isLowerCase(text.charAt(begin))) { |
| return null; |
| } |
| |
| // Make sure there is a case match; we only want to allow |
| // matching capitalized words to capitalized typos or uncapitalized typos |
| // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized |
| // typos (e.g. "enlish" to "Enlish"). |
| String glob = null; |
| for (int i = begin; ; i++) { |
| byte b = mData[offset++]; |
| if (b == 0) { |
| offset--; |
| break; |
| } else if (b == '*') { |
| int globEnd = i; |
| while (globEnd < text.length() |
| && Character.isLetter(text.charAt(globEnd))) { |
| globEnd++; |
| } |
| glob = text.subSequence(i, globEnd).toString(); |
| break; |
| } |
| char c = text.charAt(i); |
| byte cb = (byte) c; |
| if (b != cb && i > begin) { |
| return null; |
| } |
| } |
| |
| return computeSuggestions(mIndices[middle], offset, glob); |
| } |
| |
| if (compare < 0) { |
| low = middle + 1; |
| } else if (compare > 0) { |
| high = middle - 1; |
| } else { |
| assert false; // compare == 0 already handled above |
| return null; |
| } |
| } |
| |
| return null; |
| } |
| |
| /** |
| * Look up whether this word is a typo, and if so, return the typo itself |
| * and one or more likely meanings |
| * |
| * @param utf8Text the string containing the word, encoded as UTF-8 |
| * @param begin the index of the first character in the word |
| * @param end the index of the first character after the word. Note that the |
| * search may extend <b>beyond</b> this index, if for example the |
| * word matches a multi-word typo in the dictionary |
| * @return a list of the typo itself followed by the replacement strings if |
| * the word represents a typo, and null otherwise |
| */ |
| @Nullable |
| public List<String> getTypos(@NonNull byte[] utf8Text, int begin, int end) { |
| assert end <= utf8Text.length; |
| |
| int low = 0; |
| int high = mWordCount - 1; |
| while (low <= high) { |
| int middle = (low + high) >>> 1; |
| int offset = mIndices[middle]; |
| |
| if (DEBUG_SEARCH) { |
| String s = new String(Arrays.copyOfRange(utf8Text, begin, end), Charsets.UTF_8); |
| System.out.println("Comparing string " + s +" with entry at " + offset |
| + ": " + dumpEntry(offset)); |
| System.out.println(" middle=" + middle + ", low=" + low + ", high=" + high); |
| } |
| |
| // Compare the word at the given index. |
| int compare = compare(mData, offset, (byte) 0, utf8Text, begin, end); |
| |
| if (DEBUG_SEARCH) { |
| System.out.println(" signum=" + (int)Math.signum(compare) + ", delta=" + compare); |
| } |
| |
| if (compare == 0) { |
| offset = mIndices[middle]; |
| |
| // Don't allow matching uncapitalized words, such as "enlish", when |
| // the dictionary word is capitalized, "Enlish". |
| if (mData[offset] != utf8Text[begin] && isUpperCase(mData[offset])) { |
| return null; |
| } |
| |
| // Make sure there is a case match; we only want to allow |
| // matching capitalized words to capitalized typos or uncapitalized typos |
| // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized |
| // typos (e.g. "enlish" to "Enlish"). |
| String glob = null; |
| for (int i = begin; ; i++) { |
| byte b = mData[offset++]; |
| if (b == 0) { |
| offset--; |
| break; |
| } else if (b == '*') { |
| int globEnd = i; |
| while (globEnd < utf8Text.length && isLetter(utf8Text[globEnd])) { |
| globEnd++; |
| } |
| glob = new String(utf8Text, i, globEnd - i, Charsets.UTF_8); |
| break; |
| } |
| byte cb = utf8Text[i]; |
| if (b != cb && i > begin) { |
| return null; |
| } |
| } |
| |
| return computeSuggestions(mIndices[middle], offset, glob); |
| } |
| |
| if (compare < 0) { |
| low = middle + 1; |
| } else if (compare > 0) { |
| high = middle - 1; |
| } else { |
| assert false; // compare == 0 already handled above |
| return null; |
| } |
| } |
| |
| return null; |
| } |
| |
| private List<String> computeSuggestions(int begin, int offset, String glob) { |
| String typo = new String(mData, begin, offset - begin, Charsets.UTF_8); |
| |
| if (glob != null) { |
| typo = typo.replaceAll("\\*", glob); //$NON-NLS-1$ |
| } |
| |
| assert mData[offset] == 0; |
| offset++; |
| int replacementEnd = offset; |
| while (mData[replacementEnd] != 0) { |
| replacementEnd++; |
| } |
| String replacements = new String(mData, offset, replacementEnd - offset, Charsets.UTF_8); |
| List<String> words = new ArrayList<String>(); |
| words.add(typo); |
| |
| // The first entry should be the typo itself. We need to pass this back since due |
| // to multi-match words and globbing it could extend beyond the initial word range |
| |
| for (String s : Splitter.on(',').omitEmptyStrings().trimResults().split(replacements)) { |
| if (glob != null) { |
| // Need to append the glob string to each result |
| words.add(s.replaceAll("\\*", glob)); //$NON-NLS-1$ |
| } else { |
| words.add(s); |
| } |
| } |
| |
| return words; |
| } |
| |
| // "Character" handling for bytes. This assumes that the bytes correspond to Unicode |
| // characters in the 7-bit ASCII range, which are encoded the same way in UTF-8 as in |
| // ISO 8859-1. This obviously won't work for, for example, uppercase to lowercase |
| // conversions of multi-byte characters, which means we simply won't catch typos if the |
| // dictionaries contain these. None of the currently included dictionaries do. However, |
| // it does help us properly deal with punctuation and spacing characters. |
| |
| static boolean isUpperCase(byte b) { |
| return Character.isUpperCase((char) b); |
| } |
| |
| static byte toLowerCase(byte b) { |
| return (byte) Character.toLowerCase((char) b); |
| } |
| |
| static boolean isSpace(byte b) { |
| return Character.isWhitespace((char) b); |
| } |
| |
| static boolean isLetter(byte b) { |
| // Assume that multi byte characters represent letters in other languages. |
| // Obviously, it could be unusual punctuation etc but letters are more likely |
| // in this context. |
| return Character.isLetter((char) b) || (b & 0x80) != 0; |
| } |
| } |