blob: 3020136664fd3c00cd800e5b1b1b25d229d459b1 [file] [log] [blame]
/*
* Copyright (C) 2013 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//#define LOG_NDEBUG 0
#define LOG_TAG "CharacterEncodingDector"
#include <utils/Log.h>
#include <CharacterEncodingDetector.h>
#include "CharacterEncodingDetectorTables.h"
#include "utils/Vector.h"
#include "StringArray.h"
#include "unicode/ucnv.h"
#include "unicode/ucsdet.h"
#include "unicode/ustring.h"
namespace android {
/*
 * Opens the UTF-8 converter used as the target of every conversion.
 * If it cannot be opened, mUtf8Conv is left NULL and an error is logged.
 */
CharacterEncodingDetector::CharacterEncodingDetector() {
    UErrorCode err = U_ZERO_ERROR;
    mUtf8Conv = ucnv_open("UTF-8", &err);
    if (U_SUCCESS(err)) {
        return;
    }
    ALOGE("could not create UConverter for UTF-8");
    mUtf8Conv = NULL;
}
/*
 * Releases the UTF-8 converter opened by the constructor, if there is one.
 */
CharacterEncodingDetector::~CharacterEncodingDetector() {
    // mUtf8Conv is NULL when the constructor failed to open the converter;
    // closing a NULL converter is a no-op in ICU, so skipping the call is equivalent.
    if (mUtf8Conv != NULL) {
        ucnv_close(mUtf8Conv);
    }
}
/*
 * Registers one tag: 'name' is appended to mNames and 'value' to mValues, so the
 * two arrays stay index-aligned.
 */
void CharacterEncodingDetector::addTag(const char *name, const char *value) {
    mNames.push_back(name);    // tag name, e.g. "artist"
    mValues.push_back(value);  // raw tag value at the same index
}
/*
 * Returns the number of entries currently held in mNames.
 */
size_t CharacterEncodingDetector::size() {
    const size_t tagCount = mNames.size();
    return tagCount;
}
/*
 * Fetches the tag stored at 'index'.
 *
 * index: zero-based position of the tag
 * name:  receives the tag name as returned by mNames.getEntry(index)
 * value: receives the tag value as returned by mValues.getEntry(index)
 *
 * Returns OK on success, or BAD_VALUE when 'index' is negative or not smaller than
 * the number of tags; in that case *name and *value are left untouched.
 */
status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) {
    // A negative index used to slip through the upper-bound check and was reported as
    // OK; reject it explicitly.
    if (index < 0 || index >= mNames.size()) {
        return BAD_VALUE;
    }
    *name = mNames.getEntry(index);
    *value = mValues.getEntry(index);
    return OK;
}
/*
 * Returns true when each of the first 'len' bytes of 'value' lies in the range
 * 0x20..0x7e, i.e. no control characters, no DEL (0x7f) and no bytes with the high
 * bit set. An empty range (len == 0) yields true.
 */
static bool isPrintableAscii(const char *value, size_t len) {
    const char *const limit = value + len;
    for (const char *cursor = value; cursor != limit; ++cursor) {
        // Work on the unsigned byte so the test does not depend on char signedness.
        const unsigned char byte = static_cast<unsigned char>(*cursor);
        if (byte < 0x20 || byte >= 0x7f) {
            return false;
        }
    }
    return true;
}
// Returns true when 'name' is one of the tags whose encoding is detected jointly
// (artist, albumartist, composer, genre, album, title).
static bool isCombinedDetectionTag(const char *name) {
    return !strcmp(name, "artist") ||
            !strcmp(name, "albumartist") ||
            !strcmp(name, "composer") ||
            !strcmp(name, "genre") ||
            !strcmp(name, "album") ||
            !strcmp(name, "title");
}

/*
 * Detects the character encoding of the stored tag values and rewrites every value
 * that is not already UTF-8 (or plain printable ASCII) as UTF-8.
 *
 * Steps:
 * - The non-ASCII values of artist/albumartist/composer/genre/album/title are
 *   concatenated and run through ICU charset detection together; getPreferred() picks
 *   the winner. If the result is not convincing, printable artist/album/title values
 *   are appended and detection is retried. Without a usable match, ISO-8859-1 is used.
 * - Every other non-ASCII tag is detected on its own; a tag for which ICU finds no
 *   match has its value replaced by "???".
 * - Converted values have trailing and leading spaces removed. A value whose
 *   conversion fails is replaced by "???".
 * - Finally, tags whose value is empty are erased.
 *
 * Nothing is done when there are no tags, when the UTF-8 converter is unavailable, or
 * when the ICU charset detector cannot be created.
 */
void CharacterEncodingDetector::detectAndConvert() {
    int size = mNames.size();
    ALOGV("%d tags before conversion", size);
    for (int i = 0; i < size; i++) {
        ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
    }

    if (!size || !mUtf8Conv) {
        return;
    }

    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *csd = ucsdet_open(&status);
    if (U_FAILURE(status) || csd == NULL) {
        // Without a detector every ICU call below would fail.
        ALOGE("could not create UCharsetDetector (%d)", status);
        return;
    }

    // try combined detection of artist/album/title etc.
    char buf[1024];
    buf[0] = 0;
    bool allprintable = true;
    for (int i = 0; i < size; i++) {
        const char *name = mNames.getEntry(i);
        const char *value = mValues.getEntry(i);
        if (!isPrintableAscii(value, strlen(value)) && isCombinedDetectionTag(name)) {
            strlcat(buf, value, sizeof(buf));
            // separate tags by space so ICU's ngram detector can do its job
            strlcat(buf, " ", sizeof(buf));
            allprintable = false;
        }
    }

    const char *combinedenc = "UTF-8";
    if (allprintable) {
        // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
        // no need to even call it
        ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf));
    } else {
        ucsdet_setText(csd, buf, strlen(buf), &status);
        // 'matches' is only written by ICU on success; start from a defined value.
        int32_t matches = 0;
        const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
        if (U_FAILURE(status) || ucma == NULL) {
            ucma = NULL;
            matches = 0;
        }
        bool goodmatch = true;
        int highest = 0;
        const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
                ucma, matches, &goodmatch, &highest);
        ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);

        if (!goodmatch && (highest < 15 || strlen(buf) < 20)) {
            ALOGV("not a good match, trying with more data");
            // This string might be too short for ICU to do anything useful with.
            // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
            // the ISO detector reports a confidence of 0, while the GB18030 detector reports
            // a confidence of 10 with no invalid characters)
            // Append artist, album and title if they were previously omitted because they
            // were printable ascii.
            bool added = false;
            for (int i = 0; i < size; i++) {
                const char *name = mNames.getEntry(i);
                const char *value = mValues.getEntry(i);
                if (isPrintableAscii(value, strlen(value)) && (
                        !strcmp(name, "artist") ||
                        !strcmp(name, "album") ||
                        !strcmp(name, "title"))) {
                    strlcat(buf, value, sizeof(buf));
                    strlcat(buf, " ", sizeof(buf));
                    added = true;
                }
            }
            if (added) {
                // Start the retry from a clean status so an earlier failure does not
                // turn these calls into no-ops.
                status = U_ZERO_ERROR;
                matches = 0;
                ucsdet_setText(csd, buf, strlen(buf), &status);
                ucma = ucsdet_detectAll(csd, &matches, &status);
                if (U_FAILURE(status) || ucma == NULL) {
                    ucma = NULL;
                    matches = 0;
                }
                bestCombinedMatch = getPreferred(buf, strlen(buf),
                        ucma, matches, &goodmatch, &highest);
                if (!goodmatch && highest <= 15) {
                    ALOGV("still not a good match after adding printable tags");
                    bestCombinedMatch = NULL;
                }
            } else {
                ALOGV("no printable tags to add");
            }
        }

        combinedenc = NULL;
        if (bestCombinedMatch != NULL) {
            combinedenc = ucsdet_getName(bestCombinedMatch, &status);
        }
        if (combinedenc == NULL) {
            // no usable match (or ICU could not name it): fall back to Latin-1
            combinedenc = "ISO-8859-1";
        }
    }

    for (int i = 0; i < size; i++) {
        const char *name = mNames.getEntry(i);
        ALOGV("@@@ checking %s", name);
        const char *s = mValues.getEntry(i);
        int32_t inputLength = strlen(s);
        const char *enc;

        if (!allprintable && isCombinedDetectionTag(name)) {
            // use encoding determined from the combination of artist/album/title etc.
            enc = combinedenc;
        } else if (isPrintableAscii(s, inputLength)) {
            enc = "UTF-8";
            ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
        } else {
            // A failure left over from a previous tag's conversion must not disable
            // detection for this tag.
            status = U_ZERO_ERROR;
            ucsdet_setText(csd, s, inputLength, &status);
            const UCharsetMatch *ucm = ucsdet_detect(csd, &status);
            if (!ucm) {
                mValues.setEntry(i, "???");
                continue;
            }
            enc = ucsdet_getName(ucm, &status);
            if (enc == NULL) {
                mValues.setEntry(i, "???");
                continue;
            }
            ALOGV("@@@@ recognized charset: %s for %s confidence %d",
                    enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
        }

        if (strcmp(enc,"UTF-8") != 0) {
            // only convert if the source encoding isn't already UTF-8
            ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
            status = U_ZERO_ERROR;
            UConverter *conv = ucnv_open(enc, &status);
            if (U_FAILURE(status)) {
                ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
                        enc, status);
                status = U_ZERO_ERROR;
                conv = ucnv_open("ISO-8859-1", &status);
                if (U_FAILURE(status)) {
                    ALOGW("could not create UConverter for ISO-8859-1 either");
                    continue;
                }
            }

            // convert from native encoding to UTF-8
            const char* source = s;
            int targetLength = inputLength * 3 + 1;
            char* buffer = new char[targetLength];
            // don't normally check for NULL, but in this case targetLength may be large
            if (!buffer) {
                ucnv_close(conv);  // don't leak the converter on the way out
                break;
            }
            char* target = buffer;
            // Keep the last byte out of the converter's reach so the terminator written
            // below always stays inside the buffer.
            ucnv_convertEx(mUtf8Conv, conv, &target, buffer + targetLength - 1,
                    &source, source + inputLength,
                    NULL, NULL, NULL, NULL, TRUE, TRUE, &status);

            if (U_FAILURE(status)) {
                ALOGE("ucnv_convertEx failed: %d", status);
                mValues.setEntry(i, "???");
            } else {
                // zero terminate
                *target = 0;
                // strip trailing spaces (the first byte is left to the leading-space skip)
                char *end = target;
                while (end - buffer > 1 && end[-1] == ' ') {
                    --end;
                    *end = 0;
                }
                // skip leading spaces
                char *start = buffer;
                while (*start == ' ') {
                    start++;
                }
                mValues.setEntry(i, start);
            }

            delete[] buffer;
            ucnv_close(conv);
        }
    }

    // walk backwards so erasing does not shift the entries still to be visited
    for (int i = size - 1; i >= 0; --i) {
        if (strlen(mValues.getEntry(i)) == 0) {
            ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
            mNames.erase(i);
            mValues.erase(i);
        }
    }
    ucsdet_close(csd);
}
/*
 * When ICU detects multiple encoding matches, apply additional heuristics to determine
 * which one is the best match, since ICU can't always be trusted to make the right choice.
 *
 * What this method does is:
 * - decode the input using each of the matches found
 * - recalculate the starting confidence level for multibyte encodings (GB18030, Big5,
 *   EUC-KR, EUC-JP, Shift_JIS) using the frequent-character tables instead of ICU's value
 * - devalue encodings where the conversion contains unlikely characters (control
 *   characters, symbols, private use, replacement characters, etc)
 * - pick the highest match
 * - signal to the caller whether this match is considered good: with a single candidate,
 *   confidence > 15; otherwise the delta with the runner up (clamped to >= 0) > 15
 *
 * input, len:   the bytes that were handed to the ICU detector
 * ucma:         matches returned by ICU; a NULL array is treated as having no matches
 * nummatches:   number of entries in 'ucma'
 * goodmatch:    set to true when the returned match is considered good, false otherwise
 * highestmatch: set to the confidence of the returned match (ICU's confidence when there
 *               is only one match, the adjusted confidence otherwise), or 0 when there
 *               are no matches
 *
 * Returns the selected match, or NULL when there are no matches.
 */
const UCharsetMatch *CharacterEncodingDetector::getPreferred(
        const char *input, size_t len,
        const UCharsetMatch** ucma, size_t nummatches,
        bool *goodmatch, int *highestmatch) {
    *goodmatch = false;
    if (ucma == NULL) {
        nummatches = 0;
    }
    Vector<const UCharsetMatch*> matches;
    UErrorCode status = U_ZERO_ERROR;
    ALOGV("%zu matches", nummatches);
    for (size_t i = 0; i < nummatches; i++) {
        const char *encname = ucsdet_getName(ucma[i], &status);
        int confidence = ucsdet_getConfidence(ucma[i], &status);
        ALOGV("%zu: %s %d", i, encname, confidence);
        matches.push_back(ucma[i]);
    }
    size_t num = matches.size();
    if (num == 0) {
        // always report a value so the caller never sees a stale confidence
        *highestmatch = 0;
        return NULL;
    }
    if (num == 1) {
        int confidence = ucsdet_getConfidence(matches[0], &status);
        if (confidence > 15) {
            *goodmatch = true;
        }
        *highestmatch = confidence;
        return matches[0];
    }
    ALOGV("considering %zu matches", num);

    // keep track of how many "special" characters result when converting the input using each
    // encoding
    Vector<int> newconfidence;
    for (size_t i = 0; i < num; i++) {
        const uint16_t *freqdata = NULL;
        float freqcoverage = 0;
        status = U_ZERO_ERROR;
        const char *encname = ucsdet_getName(matches[i], &status);
        int confidence = ucsdet_getConfidence(matches[i], &status);
        if (!strcmp("GB18030", encname)) {
            freqdata = frequent_zhCN;
            freqcoverage = frequent_zhCN_coverage;
        } else if (!strcmp("Big5", encname)) {
            freqdata = frequent_zhTW;
            freqcoverage = frequent_zhTW_coverage;
        } else if (!strcmp("EUC-KR", encname)) {
            freqdata = frequent_ko;
            freqcoverage = frequent_ko_coverage;
        } else if (!strcmp("EUC-JP", encname)) {
            freqdata = frequent_ja;
            freqcoverage = frequent_ja_coverage;
        } else if (!strcmp("Shift_JIS", encname)) {
            freqdata = frequent_ja;
            freqcoverage = frequent_ja_coverage;
        }
        ALOGV("%zu: %s %d", i, encname, confidence);

        status = U_ZERO_ERROR;
        UConverter *conv = ucnv_open(encname, &status);
        int demerit = 0;
        bool opened = U_SUCCESS(status) && conv != NULL;
        if (!opened) {
            ALOGV("failed to open %s: %d", encname, status);
            // an encoding we cannot decode must never win
            confidence = 0;
            demerit += 1000;
        }

        const char *source = input;
        const char *sourceLimit = input + len;
        status = U_ZERO_ERROR;
        int frequentchars = 0;
        int totalchars = 0;
        // Without a converter there is nothing to decode; skip the scan entirely.
        while (opened) {
            // demerit the current encoding for each "special" character found after conversion.
            // The amount of demerit is somewhat arbitrarily chosen.
            UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
            if (!U_SUCCESS(status)) {
                // end of input or a conversion error
                break;
            }
            if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
                ALOGV("control character %x", c);
                demerit += 100;
            } else if ((c == 0xa0)                      // no-break space
                    || (c >= 0xa2 && c <= 0xbe)         // symbols, superscripts
                    || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
                    || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
                ALOGV("unlikely character %x", c);
                demerit += 10;
            } else if (c >= 0xe000 && c <= 0xf8ff) {
                ALOGV("private use character %x", c);
                demerit += 30;
            } else if (c >= 0x2190 && c <= 0x2bff) {
                // this range comprises various symbol ranges that are unlikely to appear in
                // music file metadata.
                ALOGV("symbol %x", c);
                demerit += 10;
            } else if (c == 0xfffd) {
                ALOGV("replacement character");
                demerit += 50;
            } else if (c >= 0xfff0 && c <= 0xfffc) {
                ALOGV("unicode special %x", c);
                demerit += 50;
            } else if (freqdata != NULL) {
                totalchars++;
                if (isFrequent(freqdata, c)) {
                    frequentchars++;
                }
            }
        }
        if (freqdata != NULL && totalchars != 0) {
            // replace ICU's confidence by one derived from the share of frequent characters
            int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
            ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
                    totalchars, frequentchars);
            if (myconfidence > 100) myconfidence = 100;
            if (myconfidence < 0) myconfidence = 0;
            confidence = myconfidence;
        }
        ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
        newconfidence.push_back(confidence - demerit);
        ucnv_close(conv);
        if (i == 0 && (confidence - demerit) == 100) {
            // no need to check any further, we'll end up using this match anyway
            break;
        }
    }

    // find match with highest confidence after adjusting for unlikely characters
    int highest = newconfidence[0];
    size_t highestidx = 0;
    int runnerup = -10000;
    int runnerupidx = -10000;
    // may be smaller than the number of matches when the loop above stopped early
    num = newconfidence.size();
    for (size_t i = 1; i < num; i++) {
        if (newconfidence[i] > highest) {
            runnerup = highest;
            runnerupidx = highestidx;
            highest = newconfidence[i];
            highestidx = i;
        } else if (newconfidence[i] > runnerup){
            runnerup = newconfidence[i];
            runnerupidx = i;
        }
    }
    status = U_ZERO_ERROR;
    ALOGV("selecting: '%s' w/ %d confidence",
            ucsdet_getName(matches[highestidx], &status), highest);
    if (runnerupidx < 0) {
        ALOGV("no runner up");
        if (highest > 15) {
            *goodmatch = true;
        }
    } else {
        ALOGV("runner up: '%s' w/ %d confidence",
                ucsdet_getName(matches[runnerupidx], &status), runnerup);
        if (runnerup < 0) {
            runnerup = 0;
        }
        if ((highest - runnerup) > 15) {
            *goodmatch = true;
        }
    }
    *highestmatch = highest;
    return matches[highestidx];
}
/*
 * Binary search for code point 'c' in the frequent-character table 'values'.
 * The search covers indices 0..511 and relies on the entries being in ascending order.
 * Returns true when 'c' is found.
 */
bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
    int lo = 0;
    int hi = 511; // All the tables have 512 entries
    while (lo <= hi) {
        const int probe = (lo + hi) / 2;
        const uint32_t entry = values[probe];
        if (entry < c) {
            lo = probe + 1;
        } else if (entry > c) {
            hi = probe - 1;
        } else {
            return true;
        }
    }
    return false;
}
} // namespace android