// native/utils/tokenizer-utils.h - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Utilities for tokenizing text.

 #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
 #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_

 #include <iterator>
 #include <string>
 #include <vector>

 #include "annotator/types.h"
 #include "utils/codepoint-range.h"
 #include "utils/strings/utf8.h"
 #include "utils/utf8/unicodetext.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"

 namespace libtextclassifier3 {

 // Per-codepoint decision produced by the filter passed to TokenizeWithFilter.
 // Both flags default to false, so a default-constructed FilterResult means
 // "do not split on this codepoint".
 struct FilterResult {
   // Whether to split the text on this codepoint.
   bool to_split = false;
   // If the codepoint is used to split the text, whether to also output the
   // codepoint itself as a token. Not consulted when to_split is false.
   bool to_keep = false;
 };

 // Tokenizes `text` on space and returns the resulting list of Tokens.
 // NOTE(review): the definition is not in this header; confirm there whether
 // "space" means only U+0020 or any whitespace codepoint.
 std::vector<Token> TokenizeOnSpace(const std::string& text);

 // Tokenizes `text` on the given set of delimiter codepoints and returns the
 // resulting list of Tokens.
 // If create_tokens_for_non_space_delimiters is true, tokens are also created
 // for delimiters which are not white spaces. For example "This, is" ->
 // {"This", ",", "is"}. The flag defaults to false.
 std::vector<Token> TokenizeOnDelimiters(
     const std::string& text, const absl::flat_hash_set<char32>& delimiters,
     bool create_tokens_for_non_space_delimiters = false);

 // Replicates how the original bert_tokenizer from the tflite-support library
 // pretokenizes text, which uses regex_split with its default regexes.
 // Splits the text on spaces, punctuation and Chinese characters, and outputs
 // all the tokens except spaces.
 // So far, the only known difference from the original implementation is that
 // the original regexes have 8 ranges of Chinese codepoints, whereas this
 // function has all 8 of those ranges plus two extra ranges.
 std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
     const absl::string_view text);

 // Splits `input` into Tokens, consulting `filter` once per codepoint.
 //
 // `filter` is called with each char32 codepoint of `input`, in order, and its
 // result is used as a FilterResult:
 //   * to_split == false: the codepoint stays part of the current token.
 //   * to_split == true:  the current token is emitted if it is non-empty and
 //     a new one starts after this codepoint. If to_keep is also true, the
 //     codepoint itself is emitted as a one-codepoint token; otherwise it is
 //     dropped.
 //
 // Token start/end values are codepoint offsets (not byte offsets) into
 // `input`, with an exclusive end. Tokens are returned in input order; an
 // input with no codepoints yields an empty vector.
 template <typename FilterFn>
 std::vector<Token> TokenizeWithFilter(const absl::string_view input,
                                       FilterFn filter) {
   // do_copy=false: the conversion is asked not to copy the bytes of `input`.
   const UnicodeText input_unicode = UTF8ToUnicodeText(input, /*do_copy=*/false);
   std::vector<Token> tokens;
   // Start of the token currently being accumulated, tracked both as an
   // iterator and as a codepoint offset.
   UnicodeText::const_iterator start_it = input_unicode.begin();
   int token_start_codepoint = 0;
   int codepoint_idx = 0;

   for (auto it = input_unicode.begin(); it != input_unicode.end();
        ++it, ++codepoint_idx) {
     const char32 code_point = *it;
     const FilterResult filter_result = filter(code_point);
     if (!filter_result.to_split) {
       continue;
     }
     // Emit the pending token unless it is empty (consecutive split points, or
     // a split point at the very beginning). Comparing iterators avoids
     // building a string only to find out that it is empty.
     if (start_it != it) {
       tokens.push_back(Token{UnicodeText::UTF8Substring(start_it, it),
                              token_start_codepoint, codepoint_idx});
     }
     if (filter_result.to_keep) {
       // The delimiter becomes its own token spanning exactly one codepoint.
       tokens.push_back(Token{UnicodeText::UTF8Substring(it, std::next(it)),
                              codepoint_idx, codepoint_idx + 1});
     }
     // The next token starts right after the split codepoint.
     start_it = std::next(it);
     token_start_codepoint = codepoint_idx + 1;
   }
   // Flush the last token if any.
   if (start_it != input_unicode.end()) {
     tokens.push_back(
         Token{UnicodeText::UTF8Substring(start_it, input_unicode.end()),
               token_start_codepoint, codepoint_idx});
   }
   return tokens;
 }

 }  // namespace  libtextclassifier3

 #endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Utilities for tokenizing text.

	#ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
	#define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_

	#include <string>

	#include "annotator/types.h"
	#include "utils/codepoint-range.h"
	#include "utils/strings/utf8.h"
	#include "utils/utf8/unicodetext.h"
	#include "absl/container/flat_hash_set.h"
	#include "absl/strings/string_view.h"

	namespace libtextclassifier3 {

	// Per-codepoint decision produced by the filter passed to TokenizeWithFilter.
	// Both flags default to false, so a default-constructed FilterResult means
	// "do not split on this codepoint".
	struct FilterResult {
	  // Whether to split the text on this codepoint.
	  bool to_split = false;
	  // If the codepoint is used to split the text, whether to also output the
	  // codepoint itself as a token. Not consulted when to_split is false.
	  bool to_keep = false;
	};

	// Tokenizes `text` on space and returns the resulting list of Tokens.
	// NOTE(review): the definition is not in this header; confirm there whether
	// "space" means only U+0020 or any whitespace codepoint.
	std::vector<Token> TokenizeOnSpace(const std::string& text);

	// Tokenizes `text` on the given set of delimiter codepoints and returns the
	// resulting list of Tokens.
	// If create_tokens_for_non_space_delimiters is true, tokens are also created
	// for delimiters which are not white spaces. For example "This, is" ->
	// {"This", ",", "is"}. The flag defaults to false.
	std::vector<Token> TokenizeOnDelimiters(
	const std::string& text, const absl::flat_hash_set<char32>& delimiters,
	bool create_tokens_for_non_space_delimiters = false);

	// Replicates how the original bert_tokenizer from the tflite-support library
	// pretokenizes text, which uses regex_split with its default regexes.
	// Splits the text on spaces, punctuation and Chinese characters, and outputs
	// all the tokens except spaces.
	// So far, the only known difference from the original implementation is that
	// the original regexes have 8 ranges of Chinese codepoints, whereas this
	// function has all 8 of those ranges plus two extra ranges.
	std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
	const absl::string_view text);

	// Splits `input` into Tokens, consulting `filter` once per codepoint.
	//
	// `filter` is called with each char32 codepoint of `input`, in order, and its
	// result is used as a FilterResult:
	//   * to_split == false: the codepoint stays part of the current token.
	//   * to_split == true:  the current token is emitted if it is non-empty and
	//     a new one starts after this codepoint. If to_keep is also true, the
	//     codepoint itself is emitted as a one-codepoint token; otherwise it is
	//     dropped.
	//
	// Token start/end values are codepoint offsets (not byte offsets) into
	// `input`, with an exclusive end. Tokens are returned in input order; an
	// input with no codepoints yields an empty vector.
	template <typename FilterFn>
	std::vector<Token> TokenizeWithFilter(const absl::string_view input,
	                                      FilterFn filter) {
	  // do_copy=false: the conversion is asked not to copy the bytes of `input`.
	  const UnicodeText input_unicode = UTF8ToUnicodeText(input, /*do_copy=*/false);
	  std::vector<Token> tokens;
	  // Start of the token currently being accumulated, tracked both as an
	  // iterator and as a codepoint offset.
	  UnicodeText::const_iterator start_it = input_unicode.begin();
	  int token_start_codepoint = 0;
	  int codepoint_idx = 0;

	  for (auto it = input_unicode.begin(); it != input_unicode.end();
	       ++it, ++codepoint_idx) {
	    const char32 code_point = *it;
	    const FilterResult filter_result = filter(code_point);
	    if (!filter_result.to_split) {
	      continue;
	    }
	    // Emit the pending token unless it is empty (consecutive split points, or
	    // a split point at the very beginning). Comparing iterators avoids
	    // building a string only to find out that it is empty.
	    if (start_it != it) {
	      tokens.push_back(Token{UnicodeText::UTF8Substring(start_it, it),
	                             token_start_codepoint, codepoint_idx});
	    }
	    if (filter_result.to_keep) {
	      // The delimiter becomes its own token spanning exactly one codepoint.
	      tokens.push_back(Token{UnicodeText::UTF8Substring(it, std::next(it)),
	                             codepoint_idx, codepoint_idx + 1});
	    }
	    // The next token starts right after the split codepoint.
	    start_it = std::next(it);
	    token_start_codepoint = codepoint_idx + 1;
	  }
	  // Flush the last token if any.
	  if (start_it != input_unicode.end()) {
	    tokens.push_back(
	        Token{UnicodeText::UTF8Substring(start_it, input_unicode.end()),
	              token_start_codepoint, codepoint_idx});
	  }
	  return tokens;
	}

	} // namespace libtextclassifier3

	#endif // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_