blob: 46dded609522e39e6013d4af99ffe4d2fd9246f6 [file] [log] [blame]
/*
* Copyright (C) 2009 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*/
package com.android.providers.contacts;
import java.util.Arrays;
/**
* A string distance calculator, particularly suited for name matching.
* <p>
* A detailed discussion of the topic of record linkage in general and name matching
* in particular can be found in this article:
* <blockquote>
* Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions".
* Research Report Series, RRS.
* </blockquote>
*/
public class NameDistance {
private static final float WINKLER_BONUS_THRESHOLD = 0.7f;
private static final int MIN_EXACT_PREFIX_LENGTH = 3;
private final int mMaxLength;
private final boolean mPrefixOnly;
private final boolean[] mMatchFlags1;
private final boolean[] mMatchFlags2;
/**
* Constructor.
*
* @param maxLength byte arrays are truncated if longer than this number
*/
public NameDistance(int maxLength) {
mMaxLength = maxLength;
mPrefixOnly = false;
mMatchFlags1 = new boolean[maxLength];
mMatchFlags2 = new boolean[maxLength];
}
/**
* Constructor for a matcher that only checks if one string is the exact prefix of the other
*/
public NameDistance() {
mPrefixOnly = true;
mMaxLength = 0;
mMatchFlags1 = mMatchFlags2 = null;
}
/**
* Computes a string distance between two normalized strings passed as byte arrays.
*/
public float getDistance(byte bytes1[], byte bytes2[]) {
byte[] array1, array2;
if (bytes1.length > bytes2.length) {
array2 = bytes1;
array1 = bytes2;
} else {
array2 = bytes2;
array1 = bytes1;
}
int length1 = array1.length;
if (length1 >= MIN_EXACT_PREFIX_LENGTH) {
boolean prefix = true;
for (int i = 0; i < array1.length; i++) {
if (array1[i] != array2[i]) {
prefix = false;
break;
}
}
if (prefix) {
return 1.0f;
}
}
if (mPrefixOnly) {
return 0;
}
if (length1 > mMaxLength) {
length1 = mMaxLength;
}
int length2 = array2.length;
if (length2 > mMaxLength) {
length2 = mMaxLength;
}
Arrays.fill(mMatchFlags1, 0, length1, false);
Arrays.fill(mMatchFlags2, 0, length2, false);
int range = length2 / 2 - 1;
if (range < 0) {
range = 0;
}
int matches = 0;
for (int i = 0; i < length1; i++) {
byte c1 = array1[i];
int from = i - range;
if (from < 0) {
from = 0;
}
int to = i + range + 1;
if (to > length2) {
to = length2;
}
for (int j = from; j < to; j++) {
if (!mMatchFlags2[j] && c1 == array2[j]) {
mMatchFlags1[i] = mMatchFlags2[j] = true;
matches++;
break;
}
}
}
if (matches == 0) {
return 0f;
}
int transpositions = 0;
int j = 0;
for (int i = 0; i < length1; i++) {
if (mMatchFlags1[i]) {
while (!mMatchFlags2[j]) {
j++;
}
if (array1[i] != array2[j]) {
transpositions++;
}
j++;
}
}
float m = matches;
float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2)) / m)) / 3;
if (jaro < WINKLER_BONUS_THRESHOLD) {
return jaro;
}
// Add Winkler bonus
int prefix = 0;
for (int i = 0; i < length1; i++) {
if (bytes1[i] != bytes2[i]) {
break;
}
prefix++;
}
return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro);
}
}