| /* |
| * Copyright (C) 2012 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.android.tools.lint.checks |
| |
| import com.android.tools.lint.client.api.LintClient |
| import com.android.tools.lint.detector.api.assertionsEnabled |
| import com.google.common.annotations.VisibleForTesting |
| import com.google.common.base.Charsets |
| import com.google.common.base.Splitter |
| import com.google.common.io.ByteStreams |
| import com.google.common.io.Files |
| import java.io.BufferedInputStream |
| import java.io.File |
| import java.io.FileInputStream |
| import java.io.FileNotFoundException |
| import java.io.IOException |
| import java.io.InputStream |
| import java.nio.ByteBuffer |
| import java.nio.ByteOrder |
| import java.nio.channels.FileChannel.MapMode |
| import java.util.ArrayList |
| import java.util.Arrays |
| import java.util.Random |
| import java.util.WeakHashMap |
| |
| /** Database of common typos / misspellings. */ |
| class TypoLookup private constructor( |
| private var data: ByteArray, |
| private var indices: IntArray, |
| private var wordCount: Int = 0 |
| ) { |
| /** |
| * Look up whether this word is a typo, and if so, return the typo itself and one or more likely |
| * meanings |
| * |
| * @param text the string containing the word |
| * @param begin the index of the first character in the word |
| * @param end the index of the first character after the word. Note that the search may extend |
| * **beyond** this index, if for example the word matches a multi-word typo in the |
| * dictionary |
| * @return a list of the typo itself followed by the replacement strings if the word represents |
| * a typo, and null otherwise |
| */ |
| fun getTypos(text: CharSequence, begin: Int, end: Int): List<String>? { |
| assert(end <= text.length) |
| |
| if (assertionsEnabled()) { |
| for (i in begin until end) { |
| val c = text[i] |
| if (c.toInt() >= 128) { |
| assert(false) { "Call the UTF-8 version of this method instead" } |
| return null |
| } |
| } |
| } |
| |
| var low = 0 |
| var high = wordCount - 1 |
| while (low <= high) { |
| val middle = (low + high).ushr(1) |
| var offset = indices[middle] |
| |
| // Compare the word at the given index. |
| val compare = compare(data, offset, 0.toByte(), text, begin, end) |
| |
| if (compare == 0) { |
| offset = indices[middle] |
| |
| // Don't allow matching uncapitalized words, such as "enlish", when |
| // the dictionary word is capitalized, "Enlish". |
| if (data[offset] != text[begin].toByte() && Character.isLowerCase(text[begin])) { |
| return null |
| } |
| |
| // Make sure there is a case match; we only want to allow |
| // matching capitalized words to capitalized typos or uncapitalized typos |
| // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized |
| // typos (e.g. "enlish" to "Enlish"). |
| var glob: String? = null |
| var i = begin |
| while (true) { |
| val b = data[offset++] |
| if (b.toInt() == 0) { |
| offset-- |
| break |
| } else if (b == '*'.toByte()) { |
| var globEnd = i |
| while (globEnd < text.length && Character.isLetter(text[globEnd])) { |
| globEnd++ |
| } |
| glob = text.subSequence(i, globEnd).toString() |
| break |
| } |
| val c = text[i] |
| val cb = c.toByte() |
| if (b != cb && i > begin) { |
| return null |
| } |
| i++ |
| } |
| |
| return computeSuggestions(indices[middle], offset, glob) |
| } |
| |
| if (compare < 0) { |
| low = middle + 1 |
| } else { |
| high = middle - 1 |
| } |
| } |
| |
| return null |
| } |
| |
| /** |
| * Look up whether this word is a typo, and if so, return the typo itself and one or more likely |
| * meanings |
| * |
| * @param utf8Text the string containing the word, encoded as UTF-8 |
| * @param begin the index of the first character in the word |
| * @param end the index of the first character after the word. Note that the search may extend |
| * **beyond** this index, if for example the word matches a multi-word typo in the |
| * dictionary |
| * @return a list of the typo itself followed by the replacement strings if the word represents |
| * a typo, and null otherwise |
| */ |
| fun getTypos(utf8Text: ByteArray, begin: Int, end: Int): List<String>? { |
| assert(end <= utf8Text.size) |
| |
| var low = 0 |
| var high = wordCount - 1 |
| while (low <= high) { |
| val middle = (low + high).ushr(1) |
| var offset = indices[middle] |
| |
| // Compare the word at the given index. |
| val compare = compare(data, offset, 0.toByte(), utf8Text, begin, end) |
| |
| if (compare == 0) { |
| offset = indices[middle] |
| |
| // Don't allow matching uncapitalized words, such as "enlish", when |
| // the dictionary word is capitalized, "Enlish". |
| if (data[offset] != utf8Text[begin] && isUpperCase(data[offset])) { |
| return null |
| } |
| |
| // Make sure there is a case match; we only want to allow |
| // matching capitalized words to capitalized typos or uncapitalized typos |
| // (e.g. "Teh" and "teh" to "the"), but not uncapitalized words to capitalized |
| // typos (e.g. "enlish" to "Enlish"). |
| var glob: String? = null |
| var i = begin |
| while (true) { |
| val b = data[offset++] |
| if (b.toInt() == 0) { |
| offset-- |
| break |
| } else if (b == '*'.toByte()) { |
| var globEnd = i |
| while (globEnd < utf8Text.size && isLetter(utf8Text[globEnd])) { |
| globEnd++ |
| } |
| glob = String(utf8Text, i, globEnd - i, Charsets.UTF_8) |
| break |
| } |
| val cb = utf8Text[i] |
| if (b != cb && i > begin) { |
| return null |
| } |
| i++ |
| } |
| |
| return computeSuggestions(indices[middle], offset, glob) |
| } |
| |
| if (compare < 0) { |
| low = middle + 1 |
| } else { |
| high = middle - 1 |
| } |
| } |
| |
| return null |
| } |
| |
| private fun computeSuggestions(begin: Int, initialOffset: Int, glob: String?): List<String> { |
| var offset = initialOffset |
| var typo = String(data, begin, offset - begin, Charsets.UTF_8) |
| |
| if (glob != null) { |
| typo = typo.replace("\\*".toRegex(), glob) |
| } |
| |
| assert(data[offset].toInt() == 0) |
| offset++ |
| var replacementEnd = offset |
| while (data[replacementEnd].toInt() != 0) { |
| replacementEnd++ |
| } |
| val replacements = String(data, offset, replacementEnd - offset, Charsets.UTF_8) |
| val words = ArrayList<String>() |
| words.add(typo) |
| |
| // The first entry should be the typo itself. We need to pass this back since due |
| // to multi-match words and globbing it could extend beyond the initial word range |
| |
| for (s in Splitter.on(',').omitEmptyStrings().trimResults().split(replacements)) { |
| if (glob != null) { |
| // Need to append the glob string to each result |
| words.add(s.replace("\\*".toRegex(), glob)) |
| } else { |
| words.add(s) |
| } |
| } |
| |
| return words |
| } |
| |
| companion object { |
| private val NONE = TypoLookup(ByteArray(0), IntArray(0), 0) |
| |
| /** String separating misspellings and suggested replacements in the text file */ |
| private const val WORD_SEPARATOR = "->" |
| |
| private const val FILE_HEADER = "Typo database used by Android lint\u0000" |
| private const val BINARY_FORMAT_VERSION = 2 |
| private const val DEBUG_FORCE_REGENERATE_BINARY = false |
| |
| /** Default size to reserve for each API entry when creating byte buffer to build up data */ |
| private const val BYTES_PER_ENTRY = 28 |
| |
| private val instanceMap = WeakHashMap<String, TypoLookup>() |
| |
| /** |
| * Returns an instance of the Typo database for the given locale |
| * |
| * @param client the client to associate with this database - used only for logging. The |
| * database object may be shared among repeated invocations, and in that case client used |
| * will be the one originally passed in. In other words, this parameter may be ignored if |
| * the client created is not new. |
| * @param locale the locale to look up a typo database for (should be a language code (ISO |
| * 639-1, two lowercase character names) |
| * @param region the region to look up a typo database for (should be a two letter ISO 3166-1 |
| * alpha-2 country code in upper case) language code |
| * @return a (possibly shared) instance of the typo database, or null if its data can't be found |
| */ |
| @JvmStatic |
| operator fun get( |
| client: LintClient, |
| locale: String, |
| region: String? |
| ): TypoLookup? { |
| synchronized(TypoLookup::class.java) { |
| var key = locale |
| |
| if (region != null && region.length == 2) { // skip BCP-47 regions |
| // Allow for region-specific dictionaries. See for example |
| // http://en.wikipedia.org/wiki/American_and_British_English_spelling_differences |
| assert(Character.isUpperCase(region[0]) && Character.isUpperCase(region[1])) { region } |
| // Look for typos-en-rUS.txt etc |
| key = locale + 'r'.toString() + region |
| } |
| |
| var db: TypoLookup? = instanceMap[key] |
| if (db == null) { |
| val name = "typos-$key.txt" |
| val path = "/typos/$name" |
| var stream: InputStream? = TypoLookup::class.java.getResourceAsStream(path) |
| if (stream == null) { |
| // AOSP build environment? |
| val build = System.getenv("ANDROID_BUILD_TOP") |
| if (build != null) { |
| val file = File( |
| build, |
| "sdk/files$path".replace('/', File.separatorChar) |
| ) |
| if (file.exists()) { |
| try { |
| // noinspection resource,IOResourceOpenedButNotSafelyClosed |
| stream = BufferedInputStream(FileInputStream(file)) |
| } catch (ignore: FileNotFoundException) { |
| } |
| } |
| } |
| } |
| |
| if (stream == null) { |
| |
| if (region != null) { |
| // Fall back to the generic locale (non-region-specific) database |
| return get(client, locale, null) |
| } |
| db = NONE |
| } else { |
| db = get(client, stream, name) |
| assert(db != null) { name } |
| } |
| instanceMap[key] = db |
| } |
| |
| return if (db === NONE) { |
| null |
| } else { |
| db |
| } |
| } |
| } |
| |
| /** |
| * Returns an instance of the typo database |
| * |
| * @param client the client to associate with this database - used only for logging |
| * @param xmlStream the XML file containing configuration data to use for this database |
| * @param name name to use for cache file |
| * @return a (possibly shared) instance of the typo database, or null if its data can't be found |
| */ |
| private operator fun get( |
| client: LintClient, |
| xmlStream: InputStream, |
| name: String |
| ): TypoLookup? { |
| val cacheDir = client.getCacheDir(null, true) |
| ?: return null // should not happen since create=true above |
| |
| val binaryData = File( |
| cacheDir, |
| name + |
| // Incorporate version number in the filename to avoid upgrade filename |
| // conflicts on Windows (such as issue #26663) |
| '-'.toString() + |
| BINARY_FORMAT_VERSION + |
| ".bin" |
| ) |
| |
| @Suppress("ConstantConditionIf") |
| if (DEBUG_FORCE_REGENERATE_BINARY) { |
| System.err.println( |
| "\nTemporarily regenerating binary data unconditionally \nfrom $xmlStream\nto $binaryData" |
| ) |
| if (!createCache(client, xmlStream, binaryData)) { |
| return null |
| } |
| } else if (!binaryData.exists()) { |
| if (!createCache(client, xmlStream, binaryData)) { |
| return null |
| } |
| } |
| |
| if (!binaryData.exists()) { |
| client.log(null, "The typo database file %1\$s does not exist", binaryData) |
| return null |
| } |
| |
| return readData(client, xmlStream, binaryData) |
| } |
| |
| private fun readData( |
| client: LintClient, |
| xmlStream: InputStream, |
| binaryFile: File? |
| ): TypoLookup? { |
| binaryFile ?: return null |
| |
| if (!binaryFile.exists()) { |
| client.log(null, "%1\$s does not exist", binaryFile) |
| return null |
| } |
| |
| try { |
| val buffer = Files.map(binaryFile, MapMode.READ_ONLY) |
| assert(buffer.order() == ByteOrder.BIG_ENDIAN) |
| |
| // First skip the header |
| val expectedHeader = FILE_HEADER.toByteArray(Charsets.US_ASCII) |
| buffer.rewind() |
| for (anExpectedHeader in expectedHeader) { |
| if (anExpectedHeader != buffer.get()) { |
| client.log( |
| null, |
| "Incorrect file header: not an typo database cache file, or a corrupt cache file" |
| ) |
| return null |
| } |
| } |
| |
| // Read in the format number |
| if (buffer.get().toInt() != BINARY_FORMAT_VERSION) { |
| // Force regeneration of new binary data with up to date format |
| if (createCache(client, xmlStream, binaryFile)) { |
| return readData(client, xmlStream, binaryFile) // Recurse |
| } |
| |
| return null |
| } |
| |
| val wordCount = buffer.int |
| |
| // Read in the word table indices; |
| val offsets = IntArray(wordCount) |
| |
| // Another idea: I can just store the DELTAS in the file (and add them up |
| // when reading back in) such that it takes just ONE byte instead of four! |
| |
| for (i in 0 until wordCount) { |
| offsets[i] = buffer.int |
| } |
| |
| // No need to read in the rest -- we'll just keep the whole byte array in memory |
| // TODO: Make this code smarter/more efficient. |
| val size = buffer.limit() |
| val b = ByteArray(size) |
| buffer.rewind() |
| buffer.get(b) |
| |
| // TODO: We only need to keep the data portion here since we've initialized |
| // the offset array separately. |
| // TODO: Investigate (profile) accessing the byte buffer directly instead of |
| // accessing a byte array. |
| return TypoLookup(b, offsets, wordCount) |
| } catch (e: IOException) { |
| client.log(e, null) |
| return null |
| } |
| } |
| |
| private fun createCache( |
| client: LintClient, |
| xmlStream: InputStream, |
| binaryData: File |
| ): Boolean { |
| // Read in data |
| val lines: Array<String> |
| try { |
| lines = String( |
| ByteStreams.toByteArray(xmlStream), |
| Charsets.UTF_8 |
| ).split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray() |
| } catch (e: IOException) { |
| client.log(e, "Can't read typo database file") |
| return false |
| } |
| |
| try { |
| writeDatabase(binaryData, lines) |
| return true |
| } catch (ioe: IOException) { |
| client.log(ioe, "Can't write typo cache file") |
| } |
| |
| return false |
| } |
| |
| /** |
| * See the [.readData] for documentation on the data |
| * format. |
| */ |
| @Throws(IOException::class) |
| private fun writeDatabase(file: File, lines: Array<String>) { |
| /* |
| * 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded |
| * as ASCII characters. The purpose of the header is to identify what the file |
| * is for, for anyone attempting to open the file. |
| * 2. A file version number. If the binary file does not match the reader's expected |
| * version, it can ignore it (and regenerate the cache from XML). |
| */ |
| |
| // Drop comments etc |
| val words = ArrayList<String>(lines.size) |
| for (line in lines) { |
| if (!line.isEmpty() && Character.isLetter(line[0])) { |
| var end = line.indexOf(WORD_SEPARATOR) |
| if (end == -1) { |
| end = line.trim { it <= ' ' }.length |
| } |
| val typo = line.substring(0, end).trim { it <= ' ' } |
| val replacements = |
| line.substring(end + WORD_SEPARATOR.length).trim { it <= ' ' } |
| if (replacements.isEmpty()) { |
| // We don't support empty replacements |
| continue |
| } |
| val combined = typo + 0.toChar() + replacements |
| |
| words.add(combined) |
| } |
| } |
| |
| val wordArrays = arrayOfNulls<ByteArray>(words.size) |
| run { |
| var i = 0 |
| val n = words.size |
| while (i < n) { |
| val word = words[i] |
| wordArrays[i] = word.toByteArray(Charsets.UTF_8) |
| i++ |
| } |
| } |
| // Sort words, using our own comparator to ensure that it matches the |
| // binary search in getTypos() |
| Arrays.sort<ByteArray>(wordArrays) { o1, o2 -> |
| compare(o1, 0, 0.toByte(), o2, 0, o2.size) |
| } |
| |
| val headerBytes = FILE_HEADER.toByteArray(Charsets.US_ASCII) |
| val entryCount = wordArrays.size |
| val capacity = entryCount * BYTES_PER_ENTRY + headerBytes.size + 5 |
| val buffer = ByteBuffer.allocate(capacity) |
| buffer.order(ByteOrder.BIG_ENDIAN) |
| // 1. A file header, which is the exact contents of {@link FILE_HEADER} encoded |
| // as ASCII characters. The purpose of the header is to identify what the file |
| // is for, for anyone attempting to open the file. |
| buffer.put(headerBytes) |
| |
| // 2. A file version number. If the binary file does not match the reader's expected |
| // version, it can ignore it (and regenerate the cache from XML). |
| buffer.put(BINARY_FORMAT_VERSION.toByte()) |
| |
| // 3. The number of words [1 int] |
| buffer.putInt(entryCount) |
| |
| // 4. Word offset table (one integer per word, pointing to the byte offset in the |
| // file (relative to the beginning of the file) where each word begins. |
| // The words are always sorted alphabetically. |
| val wordOffsetTable = buffer.position() |
| |
| // Reserve enough room for the offset table here: we will backfill it with pointers |
| // as we're writing out the data structures below |
| for (i in 0 until entryCount) { |
| buffer.putInt(0) |
| } |
| |
| var nextEntry = buffer.position() |
| var nextOffset = wordOffsetTable |
| |
| // 7. Word entry table. Each word entry consists of the word, followed by the byte 0 |
| // as a terminator, followed by a comma separated list of suggestions (which |
| // may be empty), or a final 0. |
| for (word in wordArrays) { |
| buffer.position(nextOffset) |
| buffer.putInt(nextEntry) |
| nextOffset = buffer.position() |
| buffer.position(nextEntry) |
| |
| buffer.put(word) // already embeds 0 to separate typo from words |
| buffer.put(0.toByte()) |
| |
| nextEntry = buffer.position() |
| } |
| |
| val size = buffer.position() |
| assert(size <= buffer.limit()) |
| buffer.mark() |
| |
| // Now dump this out as a file |
| // There's probably an API to do this more efficiently; TODO: Look into this. |
| val b = ByteArray(size) |
| buffer.rewind() |
| buffer.get(b) |
| // Write to a different file and swap it in last minute. |
| // This helps in scenarios where multiple simultaneous Gradle |
| // threads are attempting to access the file before it's ready. |
| val tmp = File(file.path + "." + Random().nextInt()) |
| Files.asByteSink(tmp).write(b) |
| if (!tmp.renameTo(file)) { |
| tmp.delete() |
| } |
| } |
| |
| /** Comparison function: *only* used for ASCII strings */ |
| @VisibleForTesting |
| @JvmStatic |
| fun compare( |
| data: ByteArray, |
| offset: Int, |
| terminator: Byte, |
| s: CharSequence, |
| begin: Int, |
| initialEnd: Int |
| ): Int { |
| var end = initialEnd |
| var i = offset |
| var j = begin |
| while (true) { |
| var b = data[i] |
| if (b == ' '.toByte()) { |
| // We've matched up to the space in a split-word typo, such as |
| // in German all zu⇒allzu; here we've matched just past "all". |
| // Rather than terminating, attempt to continue in the buffer. |
| if (j == end) { |
| val max = s.length |
| if (end < max && s[end] == ' ') { |
| // Find next word |
| while (end < max) { |
| val c = s[end] |
| if (!Character.isLetter(c)) { |
| if (c == ' ' && end == j) { |
| end++ |
| continue |
| } |
| break |
| } |
| end++ |
| } |
| } |
| } |
| } |
| |
| if (j == end) { |
| break |
| } |
| |
| if (b == '*'.toByte()) { |
| // Glob match (only supported at the end) |
| return 0 |
| } |
| val c = s[j] |
| var cb = c.toByte() |
| var delta = b - cb |
| if (delta != 0) { |
| cb = Character.toLowerCase(c).toByte() |
| if (b != cb) { |
| // Ensure that it has the right sign |
| b = Character.toLowerCase(b.toInt()).toByte() |
| delta = b - cb |
| if (delta != 0) { |
| return delta |
| } |
| } |
| } |
| i++ |
| j++ |
| } |
| |
| return data[i] - terminator |
| } |
| |
| /** Comparison function used for general UTF-8 encoded strings */ |
| @VisibleForTesting |
| @JvmStatic |
| fun compare( |
| data: ByteArray, |
| offset: Int, |
| terminator: Byte, |
| s: ByteArray, |
| begin: Int, |
| initialEnd: Int |
| ): Int { |
| var end = initialEnd |
| var i = offset |
| var j = begin |
| while (true) { |
| var b = data[i] |
| if (b == ' '.toByte()) { |
| // We've matched up to the space in a split-word typo, such as |
| // in German all zu⇒allzu; here we've matched just past "all". |
| // Rather than terminating, attempt to continue in the buffer. |
| // We've matched up to the space in a split-word typo, such as |
| // in German all zu⇒allzu; here we've matched just past "all". |
| // Rather than terminating, attempt to continue in the buffer. |
| if (j == end) { |
| val max = s.size |
| if (end < max && s[end] == ' '.toByte()) { |
| // Find next word |
| while (end < max) { |
| val cb = s[end] |
| if (!isLetter(cb)) { |
| if (cb == ' '.toByte() && end == j) { |
| end++ |
| continue |
| } |
| break |
| } |
| end++ |
| } |
| } |
| } |
| } |
| |
| if (j == end) { |
| break |
| } |
| if (b == '*'.toByte()) { |
| // Glob match (only supported at the end) |
| return 0 |
| } |
| var cb = s[j] |
| var delta = b - cb |
| if (delta != 0) { |
| cb = toLowerCase(cb) |
| b = toLowerCase(b) |
| delta = b - cb |
| if (delta != 0) { |
| return delta |
| } |
| } |
| |
| if (b == terminator || cb == terminator) { |
| return delta |
| } |
| i++ |
| j++ |
| } |
| |
| return data[i] - terminator |
| } |
| |
| // "Character" handling for bytes. This assumes that the bytes correspond to Unicode |
| // characters in the ISO 8859-1 range, which is are encoded the same way in UTF-8. |
| // This obviously won't work to for example uppercase to lowercase conversions for |
| // multi byte characters, which means we simply won't catch typos if the dictionaries |
| // contain these. None of the currently included dictionaries do. However, it does |
| // help us properly deal with punctuation and spacing characters. |
| |
| private fun isUpperCase(b: Byte): Boolean { |
| return Character.isUpperCase(b.toChar()) |
| } |
| |
| private fun toLowerCase(b: Byte): Byte { |
| return Character.toLowerCase(b.toChar()).toByte() |
| } |
| |
| @JvmStatic |
| fun isLetter(b: Byte): Boolean { |
| // Assume that multi byte characters represent letters in other languages. |
| // Obviously, it could be unusual punctuation etc but letters are more likely |
| // in this context. |
| return Character.isLetter(b.toChar()) || b.toInt() and 0x80 != 0 |
| } |
| } |
| } |