| /* |
| * Copyright (C) 2012 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.android.tools.lint.checks; |
| |
| import com.android.tools.lint.client.api.LintClient; |
| import com.android.tools.lint.detector.api.Detector; |
| import com.google.common.base.Charsets; |
| import com.google.common.base.Splitter; |
| import com.google.common.io.Files; |
| |
| import java.io.BufferedWriter; |
| import java.io.File; |
| import java.io.FileWriter; |
| import java.io.Writer; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.regex.Pattern; |
| |
| @SuppressWarnings("javadoc") |
| public class TypoLookupTest extends AbstractCheckTest { |
| private static final String SEPARATOR = "->"; |
| |
| public void testCapitalization() throws Exception { |
| LintClient client = new TestLintClient(); |
| // Make sure it can be read in |
| TypoLookup db = TypoLookup.get(client, "de", null); |
| assertNotNull(db); |
| assertNotNull(db.getTypos("Andriod".getBytes(Charsets.UTF_8), 0, "Andriod".length())); |
| } |
| |
| public void testDictionary_English() throws Exception { |
| validateDictionary("en"); |
| } |
| |
| public void testDictionary_German() throws Exception { |
| validateDictionary("de"); |
| } |
| |
| public void testDictionary_Spanish() throws Exception { |
| validateDictionary("es"); |
| } |
| |
| public void testDictionary_Hungarian() throws Exception { |
| validateDictionary("hu"); |
| } |
| |
| public void testDictionary_Italian() throws Exception { |
| validateDictionary("it"); |
| } |
| |
| public void testDictionary_Norwegian() throws Exception { |
| validateDictionary("nb"); |
| } |
| |
| public void testDictionary_Portuguese() throws Exception { |
| validateDictionary("pt"); |
| } |
| |
| public void testDictionary_Turkish() throws Exception { |
| validateDictionary("tr"); |
| } |
| |
| public void test1() { |
| TypoLookup db = TypoLookup.get(new TestLintClient(), "en", null); |
| assertNull(db.getTypos("hello", 0, "hello".length())); |
| assertNull(db.getTypos("this", 0, "this".length())); |
| |
| assertNotNull(db.getTypos("wiht", 0, "wiht".length())); |
| assertNotNull(db.getTypos("woudl", 0, "woudl".length())); |
| assertEquals("would", db.getTypos("woudl", 0, "woudl".length()).get(1)); |
| assertEquals("would", db.getTypos(" woudl ", 2, 7).get(1)); |
| assertNotNull(db.getTypos("foo wiht bar", 4, 8)); |
| |
| List<String> typos = db.getTypos("throught", 0, "throught".length()); |
| assertEquals("throught", typos.get(0)); // the typo |
| assertEquals("thought", typos.get(1)); |
| assertEquals("through", typos.get(2)); |
| assertEquals("throughout", typos.get(3)); |
| |
| // Capitalization handling |
| assertNotNull(db.getTypos("Woudl", 0, "Woudl".length())); |
| assertNotNull(db.getTypos("Enlish", 0, "Enlish".length())); |
| assertNull(db.getTypos("enlish", 0, "enlish".length())); |
| assertNull(db.getTypos("enlish".getBytes(Charsets.UTF_8), 0, "enlish".length())); |
| assertNotNull(db.getTypos("ok", 0, "ok".length())); |
| assertNotNull(db.getTypos("Ok", 0, "Ok".length())); |
| assertNull(db.getTypos("OK", 0, "OK".length())); |
| } |
| |
| public void testRegion() { |
| TypoLookup db = TypoLookup.get(new TestLintClient(), "en", "US"); |
| assertNotNull(db.getTypos("wiht", 0, "wiht".length())); |
| db = TypoLookup.get(new TestLintClient(), "en", "GB"); |
| assertNotNull(db.getTypos("wiht", 0, "wiht".length())); |
| } |
| |
| public void test2() { |
| TypoLookup db = TypoLookup.get(new TestLintClient(), "nb", null); //$NON-NLS-1$ |
| assertNull(db.getTypos("hello", 0, "hello".length())); |
| assertNull(db.getTypos("this", 0, "this".length())); |
| |
| assertNotNull(db.getTypos("altid", 0, "altid".length())); |
| assertEquals("alltid", db.getTypos("altid", 0, "altid".length()).get(1)); |
| assertEquals("alltid", db.getTypos(" altid ", 2, 7).get(1)); |
| assertNotNull(db.getTypos("foo altid bar", 4, 9)); |
| |
| // Test utf-8 string which isn't ASCII |
| String s = "karriære"; |
| byte[] sb = s.getBytes(Charsets.UTF_8); |
| assertNotNull(db.getTypos(sb, 0, sb.length)); |
| |
| assertEquals("karrière", db.getTypos(sb, 0, sb.length).get(1)); |
| } |
| |
| public void testMultiWords() { |
| // Some language dictionaries contain multi-word sequences (e.g. where there's a |
| // space on the left hand side). This needs some particular care in the lookup |
| // which is usually word oriented. |
| TypoLookup db = TypoLookup.get(new TestLintClient(), "de", "DE"); //$NON-NLS-1$ |
| |
| // all zu->allzu |
| |
| // Text handling |
| String t = "all zu"; |
| assertNotNull(db.getTypos(t, 0, t.length())); |
| assertEquals("allzu", db.getTypos(t, 0, t.length()).get(1)); |
| |
| // Byte handling |
| byte[] text = "all zu".getBytes(Charsets.UTF_8); |
| assertNotNull(db.getTypos(text, 0, text.length)); |
| assertEquals("allzu", db.getTypos(text, 0, text.length).get(1)); |
| |
| // Test automatically extending search beyond current word |
| text = "all zu".getBytes(Charsets.UTF_8); |
| assertNotNull(db.getTypos(text, 0, 3)); |
| assertEquals("allzu", db.getTypos(text, 0, text.length).get(1)); |
| |
| |
| text = ") all zu (".getBytes(Charsets.UTF_8); |
| assertNotNull(db.getTypos(text, 2, 8)); |
| assertEquals("allzu", db.getTypos(text, 2, 8).get(1)); |
| |
| text = "am einem".getBytes(Charsets.UTF_8); |
| assertNotNull(db.getTypos(text, 0, text.length)); |
| assertEquals("an einem", db.getTypos(text, 0, text.length).get(1)); |
| } |
| |
| public void testGlobbing() { |
| TypoLookup db = TypoLookup.get(new TestLintClient(), "de", null); |
| |
| // Authorisierung*->Autorisierung* |
| String text = "Authorisierungscode"; |
| byte[] bytes = text.getBytes(Charsets.UTF_8); |
| |
| assertNotNull(db.getTypos(text, 0, text.length())); |
| assertEquals("Autorisierungscode", db.getTypos(text, 0, text.length()).get(1)); |
| assertEquals(text, db.getTypos(text, 0, text.length()).get(0)); |
| |
| assertNotNull(db.getTypos(bytes, 0, bytes.length)); |
| assertEquals("Autorisierungscode", db.getTypos(bytes, 0, bytes.length).get(1)); |
| |
| // befindet ein*->befindet sich ein* |
| text = "wo befindet eine ip"; |
| assertEquals("befindet sich eine", db.getTypos(text, 3, 16).get(1)); |
| |
| // zurück ge*->zurückge* |
| text = "zurück gefoobaren"; |
| bytes = text.getBytes(Charsets.UTF_8); |
| assertNotNull(db.getTypos(bytes, 0, bytes.length)); |
| assertEquals("zurückgefoobaren", db.getTypos(bytes, 0, bytes.length).get(1)); |
| } |
| |
| public void testComparisons() throws Exception { |
| // Ensure that the two comparison methods agree |
| |
| LintClient client = new TestLintClient(); |
| for (String locale : new String[] { "de", "nb", "es", "en", "pt", "hu", "it", "tr" }) { |
| File f = client.findResource(String.format("tools/support/typos-%1$s.txt", locale)); |
| assertTrue(locale, f != null && f.exists()); |
| |
| Set<String> typos = new HashSet<String>(2000); |
| List<String> lines = Files.readLines(f, Charsets.UTF_8); |
| for (int i = 0, n = lines.size(); i < n; i++) { |
| String line = lines.get(i); |
| if (line.isEmpty() || line.trim().startsWith("#")) { //$NON-NLS-1$ |
| continue; |
| } |
| |
| int index = line.indexOf(SEPARATOR); |
| if (index == -1) { |
| continue; |
| } |
| String typo = line.substring(0, index).trim(); |
| typos.add(typo); |
| } |
| |
| List<String> words = new ArrayList<String>(typos); |
| |
| // Make sure that the two comparison methods agree on all the strings |
| // (which should be in a semi-random order now that they're in a set ordered |
| // by their hash codes) |
| |
| String prevText = words.get(0) + '\000'; |
| byte[] prevBytes = prevText.getBytes(Charsets.UTF_8); |
| |
| for (int i = 1; i < words.size(); i++) { |
| String text = words.get(i) + '\000'; |
| byte[] bytes = text.getBytes(Charsets.UTF_8); |
| |
| int textCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, text, 0, |
| text.length()); |
| int byteCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, bytes, 0, |
| bytes.length); |
| assertEquals("Word " + text + " versus prev " + prevText + " at " + i, |
| Math.signum(textCompare), Math.signum(byteCompare)); |
| } |
| } |
| } |
| |
| public void testComparison1() throws Exception { |
| String prevText = "heraus gebracht\u0000"; |
| byte[] prevBytes = prevText.getBytes(Charsets.UTF_8); |
| |
| String text = "Päsident\u0000"; |
| byte[] bytes = text.getBytes(Charsets.UTF_8); |
| |
| |
| int textCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, text, 0, |
| text.length()); |
| int byteCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, bytes, 0, |
| bytes.length); |
| assertTrue(byteCompare < 0); |
| assertTrue(textCompare < 0); |
| assertEquals("Word " + text + " versus prev " + prevText, |
| Math.signum(textCompare), Math.signum(byteCompare)); |
| } |
| |
| public void testComparison2() throws Exception { |
| String prevText = "intepretation\u0000"; |
| byte[] prevBytes = prevText.getBytes(Charsets.UTF_8); |
| |
| String text = "Woudl\u0000"; |
| byte[] bytes = text.getBytes(Charsets.UTF_8); |
| |
| int textCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, text, 0, text.length()); |
| int byteCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, bytes, 0, bytes.length); |
| assertTrue(byteCompare < 0); |
| assertTrue(textCompare < 0); |
| assertEquals("Word " + text + " versus prev " + prevText, |
| Math.signum(textCompare), Math.signum(byteCompare)); |
| |
| // Reverse capitalization and ensure that it's still the same |
| prevText = "Intepretation\u0000"; |
| prevBytes = prevText.getBytes(Charsets.UTF_8); |
| |
| text = "woudl\u0000"; |
| bytes = text.getBytes(Charsets.UTF_8); |
| |
| textCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, text, 0, text.length()); |
| byteCompare = TypoLookup.compare(prevBytes, 0, (byte) 0, bytes, 0, bytes.length); |
| assertTrue(byteCompare < 0); |
| assertTrue(textCompare < 0); |
| assertEquals("Word " + text + " versus prev " + prevText, |
| Math.signum(textCompare), Math.signum(byteCompare)); |
| } |
| |
| // Some dictionaries contain actual sentences regarding usage; these must be stripped out. |
| // They're just hardcoded here as we find them |
| private static final String[] sRemove = new String[] { |
| "- besser ganz darauf verzichten", |
| "oft fälschlich für \"angekündigt\"", |
| "hinausgehende* − insb. „darüber hinausgehende“", |
| " - besser ganz darauf verzichten", |
| "svw. bzw. so viel wie bzw. sprachverwandt" |
| }; |
| |
| private void validateDictionary(String locale) throws Exception { |
| // Check that all the typo files are well formed |
| LintClient client = new TestLintClient(); |
| File f = client.findResource(String.format("tools/support/typos-%1$s.txt", locale)); |
| assertTrue(locale, f != null && f.exists()); |
| |
| Set<String> typos = new HashSet<String>(2000); |
| List<Pattern> patterns = new ArrayList<Pattern>(100); |
| |
| List<String> lines = Files.readLines(f, Charsets.UTF_8); |
| for (int i = 0, n = lines.size(); i < n; i++) { |
| String line = lines.get(i); |
| if (line.isEmpty() || line.trim().startsWith("#")) { //$NON-NLS-1$ |
| continue; |
| } |
| |
| assertTrue(msg(f, i, "Line should contain '->': %1$s", line), |
| line.contains(SEPARATOR)); |
| int index = line.indexOf(SEPARATOR); |
| String typo = line.substring(0, index).trim(); |
| String replacements = line.substring(index + SEPARATOR.length()).trim(); |
| |
| if (typo.contains("*") && !typo.endsWith("*")) { |
| fixDictionary(f); |
| fail(msg(f, i, "Globbing (*) not supported anywhere but at the tail: %1$s", line)); |
| } else if (typo.contains("*") && !replacements.contains("*")) { |
| fail(msg(f, i, "No glob found in the replacements for %1$s", line)); |
| } |
| |
| if (replacements.indexOf(',') != -1) { |
| Set<String> seen = new HashSet<String>(); |
| for (String s : Splitter.on(',').omitEmptyStrings().split(replacements)) { |
| if (seen.contains(s)) { |
| fixDictionary(f); |
| fail(msg(f, i, "For typo " + typo |
| + " there are repeated replacements (" + s + "): " + line)); |
| } |
| } |
| } |
| |
| assertTrue(msg(f, i, "Typo entry was empty: %1$s", line), !typo.isEmpty()); |
| assertTrue(msg(f, i, "Typo replacements was empty: %1$s", line), |
| !replacements.isEmpty()); |
| |
| for (String blacklist : sRemove) { |
| if (replacements.contains(blacklist)) { |
| fail(msg(f, i, "Replacements for typo %1$s contain description: %2$s", |
| typo, replacements)); |
| } |
| } |
| if (typo.equals("sólo") && locale.equals("es")) { |
| // sólo->solo |
| // This seems to trigger a lot of false positives |
| fail(msg(f, i, "Typo %1$s triggers a lot of false positives, should be omitted", |
| typo)); |
| } |
| if (locale.equals("tr") && (typo.equals("hiç bir")|| typo.equals("öğe"))) { |
| // hiç bir->hiçbir |
| // öğe->öge |
| // According to a couple of native speakers these are not necessarily |
| // typos |
| fail(msg(f, i, "Typo %1$s triggers a lot of false positives, should be omitted", |
| typo)); |
| } |
| |
| if (typo.contains("*")) { |
| patterns.add(Pattern.compile(typo.replace("*", ".*"))); |
| } else if (!patterns.isEmpty()) { |
| for (Pattern pattern : patterns) { |
| if (pattern.matcher(typo).matches()) { |
| fixDictionary(f); |
| fail(msg(f, i, "The typo " + typo + " matches an earlier glob: ignoring")); |
| continue; |
| } |
| } |
| } |
| |
| |
| if (typos.contains(typo)) { |
| fixDictionary(f); |
| fail(msg(f, i, "Typo appeared more than once on lhs: %1$s", typo)); |
| } |
| typos.add(typo); |
| } |
| |
| // Make sure it can be read in |
| TypoLookup db = TypoLookup.get(client, locale, null); |
| assertNotNull(db); |
| assertNull(db.getTypos("abcdefghijklmnopqrstuvxyz", 0, 25)); |
| assertNull(db.getTypos("abcdefghijklmnopqrstuvxyz".getBytes(Charsets.UTF_8), 0, 25)); |
| assertNotNull(db.getTypos("Andriod", 0, "Andriod".length())); |
| assertNotNull(db.getTypos("Andriod".getBytes(Charsets.UTF_8), 0, "Andriod".length())); |
| } |
| |
| private void fixDictionary(File original) throws Exception { |
| File fixed = new File(original.getParentFile(), "fixed-" + original.getName()); |
| |
| Map<String, Integer> typos = new HashMap<String, Integer>(2000); |
| List<Pattern> patterns = new ArrayList<Pattern>(100); |
| List<String> lines = Files.readLines(original, Charsets.UTF_8); |
| List<String> output = new ArrayList<String>(lines.size()); |
| |
| wordLoop: |
| for (int i = 0, n = lines.size(); i < n; i++) { |
| String line = lines.get(i); |
| if (line.isEmpty() || line.trim().startsWith("#")) { //$NON-NLS-1$ |
| output.add(line); |
| continue; |
| } |
| |
| if (!line.contains(SEPARATOR)) { |
| System.err.println("Commented out line missing ->: " + line); |
| output.add("# " + line); |
| continue; |
| } |
| int index = line.indexOf(SEPARATOR); |
| String typo = line.substring(0, index).trim(); |
| String replacements = line.substring(index + SEPARATOR.length()).trim(); |
| |
| if (typo.isEmpty()) { |
| System.err.println("Commented out line missing a typo on the lhs: " + line); |
| output.add("# " + line); |
| continue; |
| } |
| if (replacements.isEmpty()) { |
| System.err.println("Commented out line missing replacements on the rhs: " + line); |
| output.add("# " + line); |
| continue; |
| } |
| |
| // Ensure that all the replacements are unique |
| if (replacements.indexOf(',') != -1) { |
| Set<String> seen = new HashSet<String>(); |
| List<String> out = new ArrayList<String>(); |
| boolean rewrite = false; |
| for (String s : Splitter.on(',').omitEmptyStrings().split(replacements)) { |
| if (seen.contains(s)) { |
| System.err.println("For typo " + typo |
| + " there are repeated replacements (" + s + "): " + line); |
| rewrite = true; |
| } |
| seen.add(s); |
| out.add(s); |
| } |
| if (rewrite) { |
| StringBuilder sb = new StringBuilder(); |
| for (String s : out) { |
| if (sb.length() > 0) { |
| sb.append(","); |
| } |
| sb.append(s); |
| } |
| replacements = sb.toString(); |
| line = typo + SEPARATOR + replacements; |
| } |
| } |
| |
| if (typo.contains("*")) { |
| if (!typo.endsWith("*")) { |
| // Globbing not supported anywhere but the end |
| // Drop the whole word |
| System.err.println("Skipping typo " + typo |
| + " because globbing is only supported at the end of the word"); |
| continue; |
| } |
| patterns.add(Pattern.compile(typo.replace("*", ".*"))); |
| } else if (replacements.contains("*")) { |
| System.err.println("Skipping typo " + typo + " because unexpected " + |
| "globbing character found in replacements: " |
| + replacements); |
| continue; |
| } else if (!patterns.isEmpty()) { |
| for (Pattern pattern : patterns) { |
| if (pattern.matcher(typo).matches()) { |
| System.err.println("The typo " + typo |
| + " matches an earlier glob: ignoring"); |
| continue wordLoop; |
| } |
| } |
| } |
| |
| // TODO: Strip whitespace around ->, prefix of # etc such that reading in |
| // the databases needs to do less work at runtime |
| |
| if (typos.containsKey(typo)) { |
| int l = typos.get(typo); |
| String prev = output.get(l); |
| assertTrue(prev.startsWith(typo)); |
| // Append new replacements and put back into the list |
| // (unless they're already listed as replacements) |
| Set<String> seen = new HashSet<String>(); |
| for (String s : Splitter.on(',').split(prev.substring(prev.indexOf(SEPARATOR) |
| + 2))) { |
| seen.add(s); |
| } |
| for (String s : Splitter.on(',').omitEmptyStrings().split(replacements)) { |
| if (!seen.contains(s)) { |
| prev = prev + "," + s; |
| } |
| seen.add(s); |
| } |
| output.set(l, prev); |
| } else { |
| typos.put(typo, output.size()); |
| output.add(line); |
| } |
| } |
| |
| Writer writer = new BufferedWriter(new FileWriter(fixed)); |
| for (String line : output) { |
| writer.write(line); |
| writer.write('\n'); |
| } |
| writer.close(); |
| |
| System.err.println("==> Wrote fixed typo file to " + fixed.getPath()); |
| } |
| |
| private static String msg(File file, int line, String message, Object... args) { |
| return file.getName() + ':' + Integer.toString(line + 1) + ':' + ' ' + |
| String.format(message, args); |
| } |
| |
| @Override |
| protected Detector getDetector() { |
| fail("This is not used in the TypoLookupTest"); |
| return null; |
| } |
| } |