// blob: 46a64b2ac81fe7eeb0fa4fa768dca6355a9e65cf
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lang_id/custom-tokenizer.h"
#include <ctype.h>
#include <string>
#include "lang_id/common/lite_base/attributes.h"
#include "lang_id/common/lite_base/logging.h"
#include "lang_id/common/utf8.h"
#include "utf.h"
namespace libtextclassifier3 {
namespace mobile {
namespace lang_id {
namespace {
// Returns true if the UTF8 character that starts at |curr| and has |num_bytes|
// bytes is a token separator.
//
// Only single-byte characters can be separators: a single byte is a separator
// iff isalpha() is false for it.  Characters with num_bytes != 1 are never
// separators.
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }
  // The <ctype.h> functions require an argument that is representable as
  // unsigned char (or EOF).  Plain char may be signed, in which case a byte
  // >= 0x80 would be passed as a negative value, which is undefined behavior.
  // Convert to unsigned char first.
  return !isalpha(static_cast<unsigned char>(*curr));
}
// Appends to *word the UTF8 encoding for the lowercase version of the UTF8
// character that starts at |curr| and has |num_bytes| bytes.
//
// NOTE: if the current UTF8 character does not have a lowercase version, then
// we append the original UTF8 character.
inline SAFTM_ATTRIBUTE_ALWAYS_INLINE void AppendLowerCase(const char *curr,
                                                          int num_bytes,
                                                          std::string *word) {
  if (num_bytes == 1) {
    // Optimize the single-byte (ASCII) case.
    //
    // tolower() requires an argument that is representable as unsigned char
    // (or EOF).  Plain char may be signed, so a byte >= 0x80 (if the caller
    // ever reports such a byte as a one-byte character) would be passed as a
    // negative value, which is undefined behavior.  Convert to unsigned char
    // first, and convert the int result back to char explicitly.
    const unsigned char byte = static_cast<unsigned char>(*curr);
    word->push_back(static_cast<char>(tolower(byte)));
    return;
  }

  // Harder, general case.
  //
  // NOTE: for lowercasing, we use the utils from utf.h:
  // charntorune + tolowerrune + runetochar.  Unfortunately, that library does
  // not contain any fast util for determining the number of bytes for the UTF8
  // character that starts at a given address *without* converting to a full
  // codepoint (like our utils::OneCharLen, which is used intensively by the
  // rest of our code, including by the performance-critical char ngram
  // feature).  Hence, the rest of our code continues to use utils::OneCharLen,
  // and here, when we append the bytes to *word, we make sure that's consistent
  // with utils::OneCharLen.

  // charntorune() below reads the UTF8 character that starts at curr (using at
  // most num_bytes bytes) and stores the corresponding codepoint into rune.
  Rune rune;
  charntorune(&rune, curr, num_bytes);
  if (rune != Runeerror) {
    Rune lower = tolowerrune(rune);
    char lower_buf[UTFmax];
    runetochar(lower_buf, &lower);

    // When appending the UTF8 bytes to word, we do not use the number of bytes
    // returned by runetochar(); instead, we use utils::OneCharLen(), the same
    // method used by the char ngram feature.  We expect them to be equal, but
    // just in case.
    int lower_num_bytes = utils::OneCharLen(lower_buf);

    // Reading lower_num_bytes bytes from lower_buf below is expected to be
    // safe: lower_buf holds UTFmax bytes, UTFmax is expected to be at least 4,
    // and utils::OneCharLen() is expected to return a value in [1, 4].  These
    // expectations are verified by the (debug-only) checks below.
    SAFTM_DCHECK_GE(UTFmax, 4);
    SAFTM_DCHECK_GT(lower_num_bytes, 0);
    SAFTM_DCHECK_LE(lower_num_bytes, 4);
    word->append(lower_buf, lower_num_bytes);
  } else {
    // There are sequences of bytes that charntorune() can't convert into a
    // valid Rune (a special case is [0xEF, 0xBF, 0xBD], the UTF8 encoding for
    // the U+FFFD special Unicode character, which is also the value of
    // Runeerror).  We keep those bytes unchanged.
    word->append(curr, num_bytes);
  }
}
} // namespace
// Configures this tokenizer from |context|.  The only parameter read here is
// "lang_id_lowercase_input" (default: false); its value is stored in
// lowercase_input_ and later consulted by Tokenize().
void TokenizerForLangId::Setup(TaskContext *context) {
  const auto lowercase_flag = context->Get("lang_id_lowercase_input", false);
  lowercase_input_ = lowercase_flag;
}
// Splits |text| into tokens and appends them to *sentence.
//
// Only the prefix of |text| delimited by utils::GetSafeEndOfUtf8String() is
// processed.  A token is a maximal run of characters that are not token
// separators (see IsTokenSeparator()).  Each emitted token starts with the
// special character '^' and ends with '$'.  If lowercase_input_ is set, each
// character is appended via AppendLowerCase(); otherwise its bytes are copied
// unchanged.
void TokenizerForLangId::Tokenize(StringPiece text,
                                  LightSentence *sentence) const {
  const char *cursor = text.data();
  const char *const limit =
      utils::GetSafeEndOfUtf8String(cursor, text.size());

  // Nothing to do if the safe part of the text is empty.
  if (cursor >= limit) {
    return;
  }

  // Length in bytes of the UTF8 character that starts at cursor.
  int char_len = utils::OneCharLen(cursor);

  // Steps over the current character.  Returns false if that exhausts the safe
  // text; otherwise refreshes char_len for the new position and returns true.
  const auto advance = [&cursor, &char_len, limit]() -> bool {
    cursor += char_len;
    if (cursor >= limit) {
      return false;
    }
    char_len = utils::OneCharLen(cursor);
    return true;
  };

  bool has_more = true;
  while (has_more) {
    // Skip the separators that precede the next token (if any).
    while (IsTokenSeparator(char_len, cursor)) {
      if (!advance()) {
        return;
      }
    }

    // Here, cursor points to the first character of a non-empty token.
    sentence->emplace_back();
    std::string &token = sentence->back();

    // Special token-start marker.
    token.push_back('^');

    // Copy characters into the token until the safe text ends or the next
    // character is a separator.
    do {
      if (lowercase_input_) {
        AppendLowerCase(cursor, char_len, &token);
      } else {
        token.append(cursor, char_len);
      }
      has_more = advance();
    } while (has_more && !IsTokenSeparator(char_len, cursor));

    // If text remains, the loop above stopped at a separator: consume it.
    if (has_more) {
      has_more = advance();
    }

    // Special token-end marker.
    token.push_back('$');
  }
}
} // namespace lang_id
} // namespace mobile
} // namespace libtextclassifier3