| /* |
| * Copyright 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package androidx.core.text.util; |
| |
| import static androidx.annotation.RestrictTo.Scope.LIBRARY_GROUP; |
| |
| import androidx.annotation.RestrictTo; |
| import androidx.annotation.VisibleForTesting; |
| |
| import java.util.Locale; |
| import java.util.regex.MatchResult; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| /** |
| * Support copy of https://cs.chromium.org/chromium/src/android_webview/java/src/org/chromium |
| * /android_webview/FindAddress.java |
| * |
| * @hide |
| */ |
| @RestrictTo(LIBRARY_GROUP) |
| class FindAddress { |
| private static class ZipRange { |
| int mLow; |
| int mHigh; |
| int mException1; |
| int mException2; |
| |
| ZipRange(int low, int high, int exception1, int exception2) { |
| mLow = low; |
| mHigh = high; |
| mException1 = exception1; |
| mException2 = exception1; |
| } |
| |
| boolean matches(String zipCode) { |
| int prefix = Integer.parseInt(zipCode.substring(0, 2)); |
| return (mLow <= prefix && prefix <= mHigh) || prefix == mException1 |
| || prefix == mException2; |
| } |
| } |
| |
| // Addresses consist of at least this many words, not including state and zip code. |
| private static final int MIN_ADDRESS_WORDS = 4; |
| |
| // Adddresses consist of at most this many words, not including state and zip code. |
| private static final int MAX_ADDRESS_WORDS = 14; |
| |
| // Addresses consist of at most this many lines. |
| private static final int MAX_ADDRESS_LINES = 5; |
| |
| // No words in an address are longer than this many characters. |
| private static final int kMaxAddressNameWordLength = 25; |
| |
| // Location name should be in the first MAX_LOCATION_NAME_DISTANCE words |
| private static final int MAX_LOCATION_NAME_DISTANCE = 5; |
| |
| private static final ZipRange[] sStateZipCodeRanges = { |
| new ZipRange(99, 99, -1, -1), // AK Alaska. |
| new ZipRange(35, 36, -1, -1), // AL Alabama. |
| new ZipRange(71, 72, -1, -1), // AR Arkansas. |
| new ZipRange(96, 96, -1, -1), // AS American Samoa. |
| new ZipRange(85, 86, -1, -1), // AZ Arizona. |
| new ZipRange(90, 96, -1, -1), // CA California. |
| new ZipRange(80, 81, -1, -1), // CO Colorado. |
| new ZipRange(6, 6, -1, -1), // CT Connecticut. |
| new ZipRange(20, 20, -1, -1), // DC District of Columbia. |
| new ZipRange(19, 19, -1, -1), // DE Delaware. |
| new ZipRange(32, 34, -1, -1), // FL Florida. |
| new ZipRange(96, 96, -1, -1), // FM Federated States of Micronesia. |
| new ZipRange(30, 31, -1, -1), // GA Georgia. |
| new ZipRange(96, 96, -1, -1), // GU Guam. |
| new ZipRange(96, 96, -1, -1), // HI Hawaii. |
| new ZipRange(50, 52, -1, -1), // IA Iowa. |
| new ZipRange(83, 83, -1, -1), // ID Idaho. |
| new ZipRange(60, 62, -1, -1), // IL Illinois. |
| new ZipRange(46, 47, -1, -1), // IN Indiana. |
| new ZipRange(66, 67, 73, -1), // KS Kansas. |
| new ZipRange(40, 42, -1, -1), // KY Kentucky. |
| new ZipRange(70, 71, -1, -1), // LA Louisiana. |
| new ZipRange(1, 2, -1, -1), // MA Massachusetts. |
| new ZipRange(20, 21, -1, -1), // MD Maryland. |
| new ZipRange(3, 4, -1, -1), // ME Maine. |
| new ZipRange(96, 96, -1, -1), // MH Marshall Islands. |
| new ZipRange(48, 49, -1, -1), // MI Michigan. |
| new ZipRange(55, 56, -1, -1), // MN Minnesota. |
| new ZipRange(63, 65, -1, -1), // MO Missouri. |
| new ZipRange(96, 96, -1, -1), // MP Northern Mariana Islands. |
| new ZipRange(38, 39, -1, -1), // MS Mississippi. |
| new ZipRange(55, 56, -1, -1), // MT Montana. |
| new ZipRange(27, 28, -1, -1), // NC North Carolina. |
| new ZipRange(58, 58, -1, -1), // ND North Dakota. |
| new ZipRange(68, 69, -1, -1), // NE Nebraska. |
| new ZipRange(3, 4, -1, -1), // NH New Hampshire. |
| new ZipRange(7, 8, -1, -1), // NJ New Jersey. |
| new ZipRange(87, 88, 86, -1), // NM New Mexico. |
| new ZipRange(88, 89, 96, -1), // NV Nevada. |
| new ZipRange(10, 14, 0, 6), // NY New York. |
| new ZipRange(43, 45, -1, -1), // OH Ohio. |
| new ZipRange(73, 74, -1, -1), // OK Oklahoma. |
| new ZipRange(97, 97, -1, -1), // OR Oregon. |
| new ZipRange(15, 19, -1, -1), // PA Pennsylvania. |
| new ZipRange(6, 6, 0, 9), // PR Puerto Rico. |
| new ZipRange(96, 96, -1, -1), // PW Palau. |
| new ZipRange(2, 2, -1, -1), // RI Rhode Island. |
| new ZipRange(29, 29, -1, -1), // SC South Carolina. |
| new ZipRange(57, 57, -1, -1), // SD South Dakota. |
| new ZipRange(37, 38, -1, -1), // TN Tennessee. |
| new ZipRange(75, 79, 87, 88), // TX Texas. |
| new ZipRange(84, 84, -1, -1), // UT Utah. |
| new ZipRange(22, 24, 20, -1), // VA Virginia. |
| new ZipRange(6, 9, -1, -1), // VI Virgin Islands. |
| new ZipRange(5, 5, -1, -1), // VT Vermont. |
| new ZipRange(98, 99, -1, -1), // WA Washington. |
| new ZipRange(53, 54, -1, -1), // WI Wisconsin. |
| new ZipRange(24, 26, -1, -1), // WV West Virginia. |
| new ZipRange(82, 83, -1, -1) // WY Wyoming. |
| }; |
| |
| // Newlines |
| private static final String NL = "\n\u000B\u000C\r\u0085\u2028\u2029"; |
| |
| // Space characters |
| private static final String SP = "\u0009\u0020\u00A0\u1680\u2000\u2001" |
| + "\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F" |
| + "\u205F\u3000"; |
| |
| // Whitespace |
| private static final String WS = SP + NL; |
| |
| // Characters that are considered word delimiters. |
| private static final String WORD_DELIM = ",*\u2022" + WS; |
| |
| // Lookahead for word end. |
| private static final String WORD_END = "(?=[" + WORD_DELIM + "]|$)"; |
| |
| // Address words are a sequence of non-delimiter characters. |
| private static final Pattern sWordRe = |
| Pattern.compile("[^" + WORD_DELIM + "]+" + WORD_END, Pattern.CASE_INSENSITIVE); |
| |
| // Characters that are considered suffix delimiters for house numbers. |
| private static final String HOUSE_POST_DELIM = ",\"'" + WS; |
| |
| // Lookahead for house end. |
| private static final String HOUSE_END = "(?=[" + HOUSE_POST_DELIM + "]|$)"; |
| |
| // Characters that are considered prefix delimiters for house numbers. |
| private static final String HOUSE_PRE_DELIM = ":" + HOUSE_POST_DELIM; |
| |
| // A house number component is "one" or a number, optionally |
| // followed by a single alphabetic character, or |
| private static final String HOUSE_COMPONENT = "(?:one|\\d+([a-z](?=[^a-z]|$)|st|nd|rd|th)?)"; |
| |
| // House numbers are a repetition of |HOUSE_COMPONENT|, separated by -, and followed by |
| // a delimiter character. |
| private static final Pattern sHouseNumberRe = |
| Pattern.compile(HOUSE_COMPONENT + "(?:-" + HOUSE_COMPONENT + ")*" + HOUSE_END, |
| Pattern.CASE_INSENSITIVE); |
| |
| // XXX: do we want to accept whitespace other than 0x20 in state names? |
| private static final Pattern sStateRe = Pattern.compile("(?:" |
| + "(ak|alaska)|" |
| + "(al|alabama)|" |
| + "(ar|arkansas)|" |
| + "(as|american[" + SP + "]+samoa)|" |
| + "(az|arizona)|" |
| + "(ca|california)|" |
| + "(co|colorado)|" |
| + "(ct|connecticut)|" |
| + "(dc|district[" + SP + "]+of[" + SP + "]+columbia)|" |
| + "(de|delaware)|" |
| + "(fl|florida)|" |
| + "(fm|federated[" + SP + "]+states[" + SP + "]+of[" + SP + "]+micronesia)|" |
| + "(ga|georgia)|" |
| + "(gu|guam)|" |
| + "(hi|hawaii)|" |
| + "(ia|iowa)|" |
| + "(id|idaho)|" |
| + "(il|illinois)|" |
| + "(in|indiana)|" |
| + "(ks|kansas)|" |
| + "(ky|kentucky)|" |
| + "(la|louisiana)|" |
| + "(ma|massachusetts)|" |
| + "(md|maryland)|" |
| + "(me|maine)|" |
| + "(mh|marshall[" + SP + "]+islands)|" |
| + "(mi|michigan)|" |
| + "(mn|minnesota)|" |
| + "(mo|missouri)|" |
| + "(mp|northern[" + SP + "]+mariana[" + SP + "]+islands)|" |
| + "(ms|mississippi)|" |
| + "(mt|montana)|" |
| + "(nc|north[" + SP + "]+carolina)|" |
| + "(nd|north[" + SP + "]+dakota)|" |
| + "(ne|nebraska)|" |
| + "(nh|new[" + SP + "]+hampshire)|" |
| + "(nj|new[" + SP + "]+jersey)|" |
| + "(nm|new[" + SP + "]+mexico)|" |
| + "(nv|nevada)|" |
| + "(ny|new[" + SP + "]+york)|" |
| + "(oh|ohio)|" |
| + "(ok|oklahoma)|" |
| + "(or|oregon)|" |
| + "(pa|pennsylvania)|" |
| + "(pr|puerto[" + SP + "]+rico)|" |
| + "(pw|palau)|" |
| + "(ri|rhode[" + SP + "]+island)|" |
| + "(sc|south[" + SP + "]+carolina)|" |
| + "(sd|south[" + SP + "]+dakota)|" |
| + "(tn|tennessee)|" |
| + "(tx|texas)|" |
| + "(ut|utah)|" |
| + "(va|virginia)|" |
| + "(vi|virgin[" + SP + "]+islands)|" |
| + "(vt|vermont)|" |
| + "(wa|washington)|" |
| + "(wi|wisconsin)|" |
| + "(wv|west[" + SP + "]+virginia)|" |
| + "(wy|wyoming)" |
| + ")" + WORD_END, |
| Pattern.CASE_INSENSITIVE); |
| |
| private static final Pattern sLocationNameRe = Pattern.compile("(?:" |
| + "alley|annex|arcade|ave[.]?|avenue|alameda|bayou|" |
| + "beach|bend|bluffs?|bottom|boulevard|branch|bridge|" |
| + "brooks?|burgs?|bypass|broadway|camino|camp|canyon|" |
| + "cape|causeway|centers?|circles?|cliffs?|club|common|" |
| + "corners?|course|courts?|coves?|creek|crescent|crest|" |
| + "crossing|crossroad|curve|circulo|dale|dam|divide|" |
| + "drives?|estates?|expressway|extensions?|falls?|ferry|" |
| + "fields?|flats?|fords?|forest|forges?|forks?|fort|" |
| + "freeway|gardens?|gateway|glens?|greens?|groves?|" |
| + "harbors?|haven|heights|highway|hills?|hollow|inlet|" |
| + "islands?|isle|junctions?|keys?|knolls?|lakes?|land|" |
| + "landing|lane|lights?|loaf|locks?|lodge|loop|mall|" |
| + "manors?|meadows?|mews|mills?|mission|motorway|mount|" |
| + "mountains?|neck|orchard|oval|overpass|parks?|" |
| + "parkways?|pass|passage|path|pike|pines?|plains?|" |
| + "plaza|points?|ports?|prairie|privada|radial|ramp|" |
| + "ranch|rapids?|rd[.]?|rest|ridges?|river|roads?|route|" |
| + "row|rue|run|shoals?|shores?|skyway|springs?|spurs?|" |
| + "squares?|station|stravenue|stream|st[.]?|streets?|" |
| + "summit|speedway|terrace|throughway|trace|track|" |
| + "trafficway|trail|tunnel|turnpike|underpass|unions?|" |
| + "valleys?|viaduct|views?|villages?|ville|vista|walks?|" |
| + "wall|ways?|wells?|xing|xrd)" + WORD_END, |
| Pattern.CASE_INSENSITIVE); |
| |
| private static final Pattern sSuffixedNumberRe = |
| Pattern.compile("(\\d+)(st|nd|rd|th)", Pattern.CASE_INSENSITIVE); |
| |
| private static final Pattern sZipCodeRe = |
| Pattern.compile("(?:\\d{5}(?:-\\d{4})?)" + WORD_END, Pattern.CASE_INSENSITIVE); |
| |
| private static boolean checkHouseNumber(String houseNumber) { |
| // Make sure that there are at most 5 digits. |
| int digitCount = 0; |
| for (int i = 0; i < houseNumber.length(); ++i) { |
| if (Character.isDigit(houseNumber.charAt(i))) ++digitCount; |
| } |
| if (digitCount > 5) return false; |
| |
| // Make sure that any ordinals are valid. |
| Matcher suffixMatcher = sSuffixedNumberRe.matcher(houseNumber); |
| while (suffixMatcher.find()) { |
| int num = Integer.parseInt(suffixMatcher.group(1)); |
| if (num == 0) { |
| return false; // 0th is invalid. |
| } |
| String suffix = suffixMatcher.group(2).toLowerCase(Locale.getDefault()); |
| switch (num % 10) { |
| case 1: |
| return suffix.equals(num % 100 == 11 ? "th" : "st"); |
| case 2: |
| return suffix.equals(num % 100 == 12 ? "th" : "nd"); |
| case 3: |
| return suffix.equals(num % 100 == 13 ? "th" : "rd"); |
| default: |
| return suffix.equals("th"); |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Attempt to match a house number beginnning at position offset |
| * in content. The house number must be followed by a word |
| * delimiter or the end of the string, and if offset is non-zero, |
| * then it must also be preceded by a word delimiter. |
| * |
| * @return a MatchResult if a valid house number was found. |
| */ |
| @VisibleForTesting |
| public static MatchResult matchHouseNumber(String content, int offset) { |
| if (offset > 0 && HOUSE_PRE_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null; |
| Matcher matcher = sHouseNumberRe.matcher(content).region(offset, content.length()); |
| if (matcher.lookingAt()) { |
| MatchResult matchResult = matcher.toMatchResult(); |
| if (checkHouseNumber(matchResult.group(0))) return matchResult; |
| } |
| return null; |
| } |
| |
| /** |
| * Attempt to match a US state beginnning at position offset in |
| * content. The matching state must be followed by a word |
| * delimiter or the end of the string, and if offset is non-zero, |
| * then it must also be preceded by a word delimiter. |
| * |
| * @return a MatchResult if a valid US state (or two letter code) |
| * was found. |
| */ |
| @VisibleForTesting |
| public static MatchResult matchState(String content, int offset) { |
| if (offset > 0 && WORD_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null; |
| Matcher stateMatcher = sStateRe.matcher(content).region(offset, content.length()); |
| return stateMatcher.lookingAt() ? stateMatcher.toMatchResult() : null; |
| } |
| |
| /** |
| * Test whether zipCode matches the U.S. zip code format (ddddd or |
| * ddddd-dddd) and is within the expected range, given that |
| * stateMatch is a match of sStateRe. |
| * |
| * @return true if zipCode is a valid zip code, is legal for the |
| * matched state, and is followed by a word delimiter or the end |
| * of the string. |
| */ |
| private static boolean isValidZipCode(String zipCode, MatchResult stateMatch) { |
| if (stateMatch == null) return false; |
| // Work out the index of the state, based on which group matched. |
| int stateIndex = stateMatch.groupCount(); |
| while (stateIndex > 0) { |
| if (stateMatch.group(stateIndex--) != null) break; |
| } |
| return sZipCodeRe.matcher(zipCode).matches() |
| && sStateZipCodeRanges[stateIndex].matches(zipCode); |
| } |
| |
| /** |
| * Test whether zipCode matches the U.S. zip code format (ddddd or |
| * ddddd-dddd) and is within the expected range, given that |
| * state holds a string that will match sStateRe. |
| * |
| * @return true if zipCode is a valid zip code, is legal for the |
| * given state, and is followed by a word delimiter or the end |
| * of the string. |
| */ |
| @VisibleForTesting |
| public static boolean isValidZipCode(String zipCode, String state) { |
| return isValidZipCode(zipCode, matchState(state, 0)); |
| } |
| |
| /** |
| * Test whether zipCode matches the U.S. zip code format (ddddd or ddddd-dddd). |
| * |
| * @return true if zipCode is a valid zip code followed by a word |
| * delimiter or the end of the string. |
| */ |
| @VisibleForTesting |
| public static boolean isValidZipCode(String zipCode) { |
| return sZipCodeRe.matcher(zipCode).matches(); |
| } |
| |
| /** |
| * Test whether location is one of the valid locations. |
| * |
| * @return true if location starts with a valid location name |
| * followed by a word delimiter or the end of the string. |
| */ |
| @VisibleForTesting |
| public static boolean isValidLocationName(String location) { |
| return sLocationNameRe.matcher(location).matches(); |
| } |
| |
| /** |
| * Attempt to match a complete address in content, starting with |
| * houseNumberMatch. |
| * |
| * @param content The string to search. |
| * @param houseNumberMatch A matching house number to start extending. |
| * @return +ve: the end of the match |
| * +ve: the position to restart searching for house numbers, negated. |
| */ |
| private static int attemptMatch(String content, MatchResult houseNumberMatch) { |
| int restartPos = -1; |
| int nonZipMatch = -1; |
| int it = houseNumberMatch.end(); |
| int numLines = 1; |
| boolean consecutiveHouseNumbers = true; |
| boolean foundLocationName = false; |
| int wordCount = 1; |
| String lastWord = ""; |
| |
| Matcher matcher = sWordRe.matcher(content); |
| |
| for (; it < content.length(); lastWord = matcher.group(0), it = matcher.end()) { |
| if (!matcher.find(it)) { |
| // No more words in the input sequence. |
| return -content.length(); |
| } |
| if (matcher.end() - matcher.start() > kMaxAddressNameWordLength) { |
| // Word is too long to be part of an address. Fail. |
| return -matcher.end(); |
| } |
| |
| // Count the number of newlines we just consumed. |
| while (it < matcher.start()) { |
| if (NL.indexOf(content.charAt(it++)) != -1) ++numLines; |
| } |
| |
| // Consumed too many lines. Fail. |
| if (numLines > MAX_ADDRESS_LINES) break; |
| |
| // Consumed too many words. Fail. |
| if (++wordCount > MAX_ADDRESS_WORDS) break; |
| |
| if (matchHouseNumber(content, it) != null) { |
| if (consecutiveHouseNumbers && numLines > 1) { |
| // Last line ended with a number, and this this line starts with one. |
| // Restart at this number. |
| return -it; |
| } |
| // Remember the position of this match as the restart position. |
| if (restartPos == -1) restartPos = it; |
| continue; |
| } |
| |
| consecutiveHouseNumbers = false; |
| |
| if (isValidLocationName(matcher.group(0))) { |
| foundLocationName = true; |
| continue; |
| } |
| |
| if (wordCount == MAX_LOCATION_NAME_DISTANCE && !foundLocationName) { |
| // Didn't find a location name in time. Fail. |
| it = matcher.end(); |
| break; |
| } |
| |
| if (foundLocationName && wordCount > MIN_ADDRESS_WORDS) { |
| // We can now attempt to match a state. |
| MatchResult stateMatch = matchState(content, it); |
| if (stateMatch != null) { |
| if (lastWord.equals("et") && stateMatch.group(0).equals("al")) { |
| // Reject "et al" as a false postitive. |
| it = stateMatch.end(); |
| break; |
| } |
| |
| // At this point we've matched a state; try to match a zip code after it. |
| Matcher zipMatcher = sWordRe.matcher(content); |
| if (zipMatcher.find(stateMatch.end()) |
| && isValidZipCode(zipMatcher.group(0), stateMatch)) { |
| return zipMatcher.end(); |
| } |
| // The content ends with a state but no zip |
| // code. This is a legal match according to the |
| // documentation. N.B. This differs from the |
| // original c++ implementation, which only allowed |
| // the zip code to be optional at the end of the |
| // string, which presumably is a bug. Now we |
| // prefer to find a match with a zip code, but |
| // remember non-zip matches and return them if |
| // necessary. |
| nonZipMatch = stateMatch.end(); |
| } |
| } |
| } |
| |
| if (nonZipMatch > 0) return nonZipMatch; |
| |
| return -(restartPos > 0 ? restartPos : it); |
| } |
| |
| /** |
| * Return the first matching address in content. |
| * |
| * @param content The string to search. |
| * @return The first valid address, or null if no address was matched. |
| */ |
| static String findAddress(String content) { |
| Matcher houseNumberMatcher = sHouseNumberRe.matcher(content); |
| int start = 0; |
| while (houseNumberMatcher.find(start)) { |
| if (checkHouseNumber(houseNumberMatcher.group(0))) { |
| start = houseNumberMatcher.start(); |
| int end = attemptMatch(content, houseNumberMatcher); |
| if (end > 0) { |
| return content.substring(start, end); |
| } |
| start = -end; |
| } else { |
| start = houseNumberMatcher.end(); |
| } |
| } |
| return null; |
| } |
| |
| private FindAddress() { |
| } |
| } |