// lang_id/custom-tokenizer.cc - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "lang_id/custom-tokenizer.h"

 #include <ctype.h>

 #include <string>

 #include "util/strings/utf8.h"

 namespace libtextclassifier {
 namespace nlp_core {
 namespace lang_id {

 namespace {
 // Returns true if the character that starts at *curr and spans num_bytes bytes
 // separates tokens.  Only single-byte characters can be separators: a single
 // byte is a separator exactly when isalpha() rejects it.  Characters that span
 // more than one byte are never separators.
 inline bool IsTokenSeparator(int num_bytes, const char *curr) {
   if (num_bytes != 1) {
     return false;
   }
   // isalpha() is only defined for values representable as unsigned char (or
   // EOF).  Plain char may be signed, in which case a byte >= 0x80 would be
   // negative, so convert explicitly before the call.
   return !isalpha(static_cast<unsigned char>(*curr));
 }
 }  // namespace

 // Returns a pointer to the end of the longest prefix of [data, data + size)
 // that can be walked one character at a time without stepping past
 // data + size.  The walk stops early when GetNumBytesForUTF8Char() reports 0
 // bytes (NOTE(review): presumably a '\0' byte -- confirm in
 // util/strings/utf8.h) or when the next character would extend beyond the
 // buffer, e.g., because the text was cut in the middle of a multi-byte
 // character.
 const char *GetSafeEndOfString(const char *data, size_t size) {
   const char *const hard_end = data + size;
   const char *curr = data;
   while (curr < hard_end) {
     const int num_bytes = GetNumBytesForUTF8Char(curr);
     if (num_bytes == 0) {
       break;
     }
     // Compare lengths instead of forming curr + num_bytes first: a pointer
     // beyond one-past-the-end of the buffer is undefined behavior even if it
     // is never dereferenced.
     if (num_bytes > hard_end - curr) {
       return curr;
     }
     curr += num_bytes;
   }
   return curr;
 }

 // Splits the safe prefix of text (see GetSafeEndOfString) into tokens and adds
 // one word to *sentence per token.  Tokens are maximal runs of characters that
 // are not token separators; each word is stored as '^' + token bytes + '$'.
 // Nothing is added if the safe prefix is empty or contains only separators.
 void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) {
   const char *pos = text.data();
   const char *const safe_end = GetSafeEndOfString(pos, text.size());

   // Each pass through this loop consumes at least one character, so it
   // terminates as long as GetNumBytesForNonZeroUTF8Char() returns a positive
   // length.
   while (pos < safe_end) {
     int char_len = GetNumBytesForNonZeroUTF8Char(pos);

     // Separators never belong to a token: step over them one at a time.
     if (IsTokenSeparator(char_len, pos)) {
       pos += char_len;
       continue;
     }

     // pos is at the first character of a non-empty token.
     std::string *const token = sentence->add_word();

     // Special token-start marker.
     token->push_back('^');

     // Copy characters until the safe text runs out or a separator shows up.
     // The separator itself is left for the next pass of the outer loop.
     do {
       token->append(pos, char_len);
       pos += char_len;
       if (pos >= safe_end) {
         break;
       }
       char_len = GetNumBytesForNonZeroUTF8Char(pos);
     } while (!IsTokenSeparator(char_len, pos));

     // Special token-end marker.
     token->push_back('$');

     // Note: token start/end offsets are intentionally not recorded, as those
     // fields are not used by the langid model.
   }
 }

 }  // namespace lang_id
 }  // namespace nlp_core
 }  // namespace libtextclassifier
	/*
	* Copyright (C) 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "lang_id/custom-tokenizer.h"

	#include <ctype.h>

	#include <string>

	#include "util/strings/utf8.h"

	namespace libtextclassifier {
	namespace nlp_core {
	namespace lang_id {

	namespace {
	// Returns true if the character that starts at *curr and spans num_bytes
	// bytes separates tokens.  Only single-byte characters can be separators: a
	// single byte is a separator exactly when isalpha() rejects it.  Characters
	// that span more than one byte are never separators.
	inline bool IsTokenSeparator(int num_bytes, const char *curr) {
	  if (num_bytes != 1) {
	    return false;
	  }
	  // isalpha() is only defined for values representable as unsigned char (or
	  // EOF).  Plain char may be signed, in which case a byte >= 0x80 would be
	  // negative, so convert explicitly before the call.
	  return !isalpha(static_cast<unsigned char>(*curr));
	}
	} // namespace

	// Returns a pointer to the end of the longest prefix of [data, data + size)
	// that can be walked one character at a time without stepping past
	// data + size.  The walk stops early when GetNumBytesForUTF8Char() reports 0
	// bytes (NOTE(review): presumably a '\0' byte -- confirm in
	// util/strings/utf8.h) or when the next character would extend beyond the
	// buffer, e.g., because the text was cut in the middle of a multi-byte
	// character.
	//
	// Both the return type and the data parameter are pointers (const char *);
	// the body does pointer arithmetic on data and the caller stores the result
	// in a const char *.
	const char *GetSafeEndOfString(const char *data, size_t size) {
	  const char *const hard_end = data + size;
	  const char *curr = data;
	  while (curr < hard_end) {
	    const int num_bytes = GetNumBytesForUTF8Char(curr);
	    if (num_bytes == 0) {
	      break;
	    }
	    // Compare lengths instead of forming curr + num_bytes first: a pointer
	    // beyond one-past-the-end of the buffer is undefined behavior even if it
	    // is never dereferenced.
	    if (num_bytes > hard_end - curr) {
	      return curr;
	    }
	    curr += num_bytes;
	  }
	  return curr;
	}

	// Splits the safe prefix of text (see GetSafeEndOfString) into tokens and
	// adds one word to *sentence per token.  Tokens are maximal runs of
	// characters that are not token separators; each word is stored as
	// '^' + token bytes + '$'.  Nothing is added if the safe prefix is empty or
	// contains only separators.
	void TokenizeTextForLangId(const std::string &text, LightSentence *sentence) {
	  const char *cursor = text.data();
	  const char *const limit = GetSafeEndOfString(cursor, text.size());

	  // Each pass through this loop consumes at least one character, so it
	  // terminates as long as GetNumBytesForNonZeroUTF8Char() returns a positive
	  // length.
	  while (cursor < limit) {
	    int width = GetNumBytesForNonZeroUTF8Char(cursor);

	    // Separators never belong to a token: step over them one at a time.
	    if (IsTokenSeparator(width, cursor)) {
	      cursor += width;
	      continue;
	    }

	    // cursor is at the first character of a non-empty token.
	    std::string *const word = sentence->add_word();

	    // Special token-start marker.
	    word->push_back('^');

	    // Copy characters until the safe text runs out or a separator shows up.
	    // The separator itself is left for the next pass of the outer loop.
	    do {
	      word->append(cursor, width);
	      cursor += width;
	      if (cursor >= limit) {
	        break;
	      }
	      width = GetNumBytesForNonZeroUTF8Char(cursor);
	    } while (!IsTokenSeparator(width, cursor));

	    // Special token-end marker.
	    word->push_back('$');

	    // Note: token start/end offsets are intentionally not recorded, as those
	    // fields are not used by the langid model.
	  }
	}

	} // namespace lang_id
	} // namespace nlp_core
	} // namespace libtextclassifier