blob: 6fab796a253236e3ece24da0f0c8e1b460cc87ba [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
#define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
#include <string>
#include "lang_id/common/fel/task-context.h"
#include "lang_id/common/lite_strings/stringpiece.h"
#include "lang_id/light-sentence.h"
namespace libtextclassifier3 {
namespace mobile {
namespace lang_id {
// Custom tokenizer for the LangId model.
class TokenizerForLangId {
public:
void Setup(TaskContext *context);
// Tokenizes |text|, placing the tokens into |sentence|. Customized for
// LangId. Currently (Sep 15, 2016) we tokenize on space, newline, tab, and
// any other 1-byte UTF8 character which is not a letter, ignore all empty
// tokens, and (for each of the remaining tokens) prepend "^" (special token
// begin marker) and append "$" (special token end marker).
//
// Tokens are stored into the "repeated Token token;" field of *sentence.
void Tokenize(StringPiece text, LightSentence *sentence) const;
private:
// If true, during tokenization, we use the lowercase version of each Unicode
// character from the text to tokenize. E.g., if this is true, the text "Foo
// bar" is tokenized as ["foo", "bar"]; otherwise, we get ["Foo", "bar"].
bool lowercase_input_ = false;
};
} // namespace lang_id
} // namespace mobile
} // namespace nlp_saft
#endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_