| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License |
| */ |
| package com.android.providers.contacts; |
| |
| import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; |
| |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.List; |
| |
| /** |
| * Logic for matching contacts' data and accumulating match scores. |
| */ |
| public class ContactMatcher { |
| |
| // Best possible match score |
| public static final int MAX_SCORE = 100; |
| |
| // Suggest to aggregate contacts if their match score is equal or greater than this threshold |
| public static final int SCORE_THRESHOLD_SUGGEST = 50; |
| |
| // Automatically aggregate contacts if their match score is equal or greater than this threshold |
| public static final int SCORE_THRESHOLD_PRIMARY = 70; |
| |
| // Automatically aggregate contacts if the match score is equal or greater than this threshold |
| // and there is a secondary match (phone number, email etc). |
| public static final int SCORE_THRESHOLD_SECONDARY = 50; |
| |
| // Score for missing data (as opposed to present data but a bad match) |
| private static final int NO_DATA_SCORE = -1; |
| |
| // Score for matching phone numbers |
| private static final int PHONE_MATCH_SCORE = 71; |
| |
| // Score for matching email addresses |
| private static final int EMAIL_MATCH_SCORE = 71; |
| |
| // Score for matching nickname |
| private static final int NICKNAME_MATCH_SCORE = 71; |
| |
| // Maximum number of characters in a name to be considered by the matching algorithm. |
| private static final int MAX_MATCHED_NAME_LENGTH = 30; |
| |
| // Scores a multiplied by this number to allow room for "fractional" scores |
| private static final int SCORE_SCALE = 1000; |
| |
| public static final int MATCHING_ALGORITHM_EXACT = 0; |
| public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; |
| public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; |
| |
| // Minimum edit distance between two names to be considered an approximate match |
| public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; |
| |
| // Minimum edit distance between two email ids to be considered an approximate match |
| public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; |
| |
| /** |
| * Name matching scores: a matrix by name type vs. candidate lookup type. |
| * For example, if the name type is "full name" while we are looking for a |
| * "full name", the score may be 99. If we are looking for a "nickname" but |
| * find "first name", the score may be 50 (see specific scores defined |
| * below.) |
| * <p> |
| * For approximate matching, we have a range of scores, let's say 40-70. Depending one how |
| * similar the two strings are, the score will be somewhere between 40 and 70, with the exact |
| * match producing the score of 70. The score may also be 0 if the similarity (distance) |
| * between the strings is below the threshold. |
| * <p> |
| * We use a string matching algorithm, which is particularly suited for |
| * name matching. See {@link NameDistance}. |
| */ |
| private static int[] sMinScore = |
| new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; |
| private static int[] sMaxScore = |
| new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; |
| |
| /* |
| * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, |
| * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are |
| * not! They are useful in three-way aggregation cases when we have, for example, both |
| * John Smith and Smith John. A third contact with the name John Smith should be aggregated |
| * with the former rather than the latter. This is why "reverse" matches have slightly lower |
| * scores than direct matches. |
| */ |
| static { |
| setScoreRange(NameLookupType.NAME_EXACT, |
| NameLookupType.NAME_EXACT, 99, 99); |
| setScoreRange(NameLookupType.NAME_VARIANT, |
| NameLookupType.NAME_VARIANT, 90, 90); |
| setScoreRange(NameLookupType.NAME_COLLATION_KEY, |
| NameLookupType.NAME_COLLATION_KEY, 50, 80); |
| |
| setScoreRange(NameLookupType.NAME_COLLATION_KEY, |
| NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); |
| setScoreRange(NameLookupType.NAME_COLLATION_KEY, |
| NameLookupType.NICKNAME, 50, 60); |
| |
| setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, |
| NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); |
| setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, |
| NameLookupType.NAME_COLLATION_KEY, 50, 60); |
| setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, |
| NameLookupType.NICKNAME, 50, 60); |
| |
| setScoreRange(NameLookupType.NICKNAME, |
| NameLookupType.NICKNAME, 50, 60); |
| setScoreRange(NameLookupType.NICKNAME, |
| NameLookupType.NAME_COLLATION_KEY, 50, 60); |
| setScoreRange(NameLookupType.NICKNAME, |
| NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); |
| } |
| |
| /** |
| * Populates the cells of the score matrix and score span matrix |
| * corresponding to the {@code candidateNameType} and {@code nameType}. |
| */ |
| private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) { |
| int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; |
| sMinScore[index] = scoreFrom; |
| sMaxScore[index] = scoreTo; |
| } |
| |
| /** |
| * Returns the lower range for the match score for the given {@code candidateNameType} and |
| * {@code nameType}. |
| */ |
| private static int getMinScore(int candidateNameType, int nameType) { |
| int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; |
| return sMinScore[index]; |
| } |
| |
| /** |
| * Returns the upper range for the match score for the given {@code candidateNameType} and |
| * {@code nameType}. |
| */ |
| private static int getMaxScore(int candidateNameType, int nameType) { |
| int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; |
| return sMaxScore[index]; |
| } |
| |
| /** |
| * Captures the max score and match count for a specific contact. Used in an |
| * contactId - MatchScore map. |
| */ |
| public static class MatchScore implements Comparable<MatchScore> { |
| private long mContactId; |
| private boolean mKeepIn; |
| private boolean mKeepOut; |
| private int mPrimaryScore; |
| private int mSecondaryScore; |
| private int mMatchCount; |
| |
| public MatchScore(long contactId) { |
| this.mContactId = contactId; |
| } |
| |
| public void reset(long contactId) { |
| this.mContactId = contactId; |
| mKeepIn = false; |
| mKeepOut = false; |
| mPrimaryScore = 0; |
| mSecondaryScore = 0; |
| mMatchCount = 0; |
| } |
| |
| public long getContactId() { |
| return mContactId; |
| } |
| |
| public void updatePrimaryScore(int score) { |
| if (score > mPrimaryScore) { |
| mPrimaryScore = score; |
| } |
| mMatchCount++; |
| } |
| |
| public void updateSecondaryScore(int score) { |
| if (score > mSecondaryScore) { |
| mSecondaryScore = score; |
| } |
| mMatchCount++; |
| } |
| |
| public void keepIn() { |
| mKeepIn = true; |
| } |
| |
| public void keepOut() { |
| mKeepOut = true; |
| } |
| |
| public int getScore() { |
| if (mKeepOut) { |
| return 0; |
| } |
| |
| if (mKeepIn) { |
| return MAX_SCORE; |
| } |
| |
| int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore); |
| |
| // Ensure that of two contacts with the same match score the one with more matching |
| // data elements wins. |
| return score * SCORE_SCALE + mMatchCount; |
| } |
| |
| /** |
| * Descending order of match score. |
| */ |
| public int compareTo(MatchScore another) { |
| return another.getScore() - getScore(); |
| } |
| |
| @Override |
| public String toString() { |
| return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount |
| + ")"; |
| } |
| } |
| |
| private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>(); |
| private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); |
| private int mScoreCount = 0; |
| |
| private final NameDistance mNameDistanceConservative = new NameDistance(); |
| private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); |
| |
| private MatchScore getMatchingScore(long contactId) { |
| MatchScore matchingScore = mScores.get(contactId); |
| if (matchingScore == null) { |
| if (mScoreList.size() > mScoreCount) { |
| matchingScore = mScoreList.get(mScoreCount); |
| matchingScore.reset(contactId); |
| } else { |
| matchingScore = new MatchScore(contactId); |
| mScoreList.add(matchingScore); |
| } |
| mScoreCount++; |
| mScores.put(contactId, matchingScore); |
| } |
| return matchingScore; |
| } |
| |
| /** |
| * Checks if there is a match and updates the overall score for the |
| * specified contact for a discovered match. The new score is determined |
| * by the prior score, by the type of name we were looking for, the type |
| * of name we found and, if the match is approximate, the distance between the candidate and |
| * actual name. |
| */ |
| public void matchName(long contactId, int candidateNameType, String candidateName, |
| int nameType, String name, int algorithm) { |
| int maxScore = getMaxScore(candidateNameType, nameType); |
| if (maxScore == 0) { |
| return; |
| } |
| |
| if (candidateName.equals(name)) { |
| updatePrimaryScore(contactId, maxScore); |
| return; |
| } |
| |
| if (algorithm == MATCHING_ALGORITHM_EXACT) { |
| return; |
| } |
| |
| int minScore = getMinScore(candidateNameType, nameType); |
| if (minScore == maxScore) { |
| return; |
| } |
| |
| byte[] decodedCandidateName = Hex.decodeHex(candidateName); |
| byte[] decodedName = Hex.decodeHex(name); |
| |
| NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? |
| mNameDistanceConservative : mNameDistanceApproximate; |
| |
| int score; |
| float distance = nameDistance.getDistance(decodedCandidateName, decodedName); |
| boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME |
| || nameType == NameLookupType.EMAIL_BASED_NICKNAME; |
| float threshold = emailBased |
| ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL |
| : APPROXIMATE_MATCH_THRESHOLD; |
| if (distance > threshold) { |
| score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); |
| } else { |
| score = 0; |
| } |
| |
| updatePrimaryScore(contactId, score); |
| } |
| |
| public void updateScoreWithPhoneNumberMatch(long contactId) { |
| updateSecondaryScore(contactId, PHONE_MATCH_SCORE); |
| } |
| |
| public void updateScoreWithEmailMatch(long contactId) { |
| updateSecondaryScore(contactId, EMAIL_MATCH_SCORE); |
| } |
| |
| public void updateScoreWithNicknameMatch(long contactId) { |
| updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE); |
| } |
| |
| private void updatePrimaryScore(long contactId, int score) { |
| getMatchingScore(contactId).updatePrimaryScore(score); |
| } |
| |
| private void updateSecondaryScore(long contactId, int score) { |
| getMatchingScore(contactId).updateSecondaryScore(score); |
| } |
| |
| public void keepIn(long contactId) { |
| getMatchingScore(contactId).keepIn(); |
| } |
| |
| public void keepOut(long contactId) { |
| getMatchingScore(contactId).keepOut(); |
| } |
| |
| public void clear() { |
| mScores.clear(); |
| mScoreCount = 0; |
| } |
| |
| /** |
| * Returns a list of IDs for contacts that are matched on secondary data elements |
| * (phone number, email address, nickname). We still need to obtain the approximate |
| * primary score for those contacts to determine if any of them should be aggregated. |
| * <p> |
| * May return null. |
| */ |
| public List<Long> prepareSecondaryMatchCandidates(int threshold) { |
| ArrayList<Long> contactIds = null; |
| |
| for (int i = 0; i < mScoreCount; i++) { |
| MatchScore score = mScoreList.get(i); |
| if (score.mKeepOut) { |
| continue; |
| } |
| |
| int s = score.mSecondaryScore; |
| if (s >= threshold) { |
| if (contactIds == null) { |
| contactIds = new ArrayList<Long>(); |
| } |
| contactIds.add(score.mContactId); |
| } |
| score.mPrimaryScore = NO_DATA_SCORE; |
| } |
| return contactIds; |
| } |
| |
| /** |
| * Returns the contactId with the best match score over the specified threshold or -1 |
| * if no such contact is found. |
| */ |
| public long pickBestMatch(int threshold) { |
| long contactId = -1; |
| int maxScore = 0; |
| for (int i = 0; i < mScoreCount; i++) { |
| MatchScore score = mScoreList.get(i); |
| if (score.mKeepOut) { |
| continue; |
| } |
| |
| if (score.mKeepIn) { |
| return score.mContactId; |
| } |
| |
| int s = score.mPrimaryScore; |
| if (s == NO_DATA_SCORE) { |
| s = score.mSecondaryScore; |
| } |
| |
| if (s >= threshold && s > maxScore) { |
| contactId = score.mContactId; |
| maxScore = s; |
| } |
| } |
| return contactId; |
| } |
| |
| /** |
| * Returns matches in the order of descending score. |
| */ |
| public List<MatchScore> pickBestMatches(int threshold) { |
| int scaledThreshold = threshold * SCORE_SCALE; |
| List<MatchScore> matches = mScoreList.subList(0, mScoreCount); |
| Collections.sort(matches); |
| int count = 0; |
| for (int i = 0; i < mScoreCount; i++) { |
| MatchScore matchScore = matches.get(i); |
| if (matchScore.getScore() >= scaledThreshold) { |
| count++; |
| } else { |
| break; |
| } |
| } |
| |
| return matches.subList(0, count); |
| } |
| |
| @Override |
| public String toString() { |
| return mScoreList.toString(); |
| } |
| } |