// lang_id/custom-tokenizer.h - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_

 #include <string>

 #include "lang_id/common/fel/task-context.h"
 #include "lang_id/common/lite_strings/stringpiece.h"
 #include "lang_id/light-sentence.h"

 namespace libtextclassifier3 {
 namespace mobile {
 namespace lang_id {

 // Tokenizer tailored to the needs of the LangId model.
 class TokenizerForLangId {
  public:
   // NOTE(review): presumably configures this tokenizer (e.g., the lowercasing
   // flag declared below) from |context| -- the definition is not part of this
   // header, so confirm against the implementation.
   void Setup(TaskContext *context);

   // Splits |text| into tokens and stores them into the
   // "repeated Token token;" field of *sentence.
   //
   // LangId-specific rules, as of Sep 15, 2016:
   //   * token boundaries are space, newline, tab, and every other 1-byte UTF8
   //     character which is not a letter;
   //   * empty tokens are ignored;
   //   * each remaining token gets "^" (special token begin marker) prepended
   //     and "$" (special token end marker) appended.
   void Tokenize(StringPiece text, LightSentence *sentence) const;

  private:
   // Case-handling switch used during tokenization.  When true, each Unicode
   // character of the text is replaced by its lowercase version, so "Foo bar"
   // is tokenized as ["foo", "bar"]; when false, the same text is tokenized as
   // ["Foo", "bar"].
   bool lowercase_input_{false};
 };

 }  // namespace lang_id
 }  // namespace mobile
 }  // namespace libtextclassifier3

 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
	#define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_

	#include <string>

	#include "lang_id/common/fel/task-context.h"
	#include "lang_id/common/lite_strings/stringpiece.h"
	#include "lang_id/light-sentence.h"

	namespace libtextclassifier3 {
	namespace mobile {
	namespace lang_id {

	// Tokenizer tailored to the needs of the LangId model.
	class TokenizerForLangId {
	 public:
	  // NOTE(review): presumably configures this tokenizer (e.g., the lowercasing
	  // flag declared below) from |context| -- the definition is not part of this
	  // header, so confirm against the implementation.
	  void Setup(TaskContext *context);

	  // Splits |text| into tokens and stores them into the
	  // "repeated Token token;" field of *sentence.
	  //
	  // LangId-specific rules, as of Sep 15, 2016:
	  //   * token boundaries are space, newline, tab, and every other 1-byte UTF8
	  //     character which is not a letter;
	  //   * empty tokens are ignored;
	  //   * each remaining token gets "^" (special token begin marker) prepended
	  //     and "$" (special token end marker) appended.
	  void Tokenize(StringPiece text, LightSentence *sentence) const;

	 private:
	  // Case-handling switch used during tokenization.  When true, each Unicode
	  // character of the text is replaced by its lowercase version, so "Foo bar"
	  // is tokenized as ["foo", "bar"]; when false, the same text is tokenized as
	  // ["Foo", "bar"].
	  bool lowercase_input_{false};
	};

	} // namespace lang_id
	} // namespace mobile
	} // namespace libtextclassifier3

	#endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_