blob: 9c6af909c5b0a6228eec5f16cee4259e56e53cb5 [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "annotator/grammar/utils.h"
namespace libtextclassifier3 {
// Creates a Tokenizer configured from the given grammar tokenizer options.
//
// Pointers to the entries of the two optional codepoint range tables are
// collected from `options`; a table that is not set yields an empty list.
// Tokenizing on script change is only requested when a tokenization codepoint
// config is present and the corresponding option flag is set.
// `icu_preserve_whitespace_tokens` is always passed as false.
//
// NOTE: `options` is dereferenced without a null check, so callers must pass
// a valid pointer.
Tokenizer BuildTokenizer(const UniLib* unilib,
                         const GrammarTokenizerOptions* options) {
  // Both tables are optional; the accessors return nullptr when unset.
  const auto* split_ranges = options->tokenization_codepoint_config();
  const auto* internal_ranges = options->internal_tokenizer_codepoint_ranges();

  std::vector<const TokenizationCodepointRange*> tokenization_ranges;
  if (split_ranges != nullptr) {
    tokenization_ranges.insert(tokenization_ranges.end(),
                               split_ranges->begin(), split_ranges->end());
  }

  std::vector<const CodepointRange*> internal_tokenizer_ranges;
  if (internal_ranges != nullptr) {
    internal_tokenizer_ranges.insert(internal_tokenizer_ranges.end(),
                                     internal_ranges->begin(),
                                     internal_ranges->end());
  }

  // The script-change flag is only honoured together with a codepoint config.
  bool split_on_script_change = false;
  if (split_ranges != nullptr) {
    split_on_script_change = options->tokenize_on_script_change();
  }

  return Tokenizer(options->tokenization_type(), unilib, tokenization_ranges,
                   internal_tokenizer_ranges, split_on_script_change,
                   /*icu_preserve_whitespace_tokens=*/false);
}
} // namespace libtextclassifier3