utils/sentencepiece/encoder.cc - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "utils/sentencepiece/encoder.h"

 namespace libtextclassifier3 {

 // Splits `normalized_text` into sentence pieces and stores their codes in
 // `encoded_text`, wrapped in `start_code_` ... `end_code_`.
 //
 // The segmentation is chosen by dynamic programming over text positions: for
 // every position the highest-scoring way of reaching it is remembered, and the
 // final sequence is recovered by walking the back-pointers from the end of the
 // text. When `unknown_code_` is non-negative, a single position may also be
 // consumed as an unknown token; consecutive unknown tokens collapse into one.
 //
 // Returns false only when the matcher fails to enumerate prefix matches;
 // otherwise returns true. Empty input, or input for which no segmentation
 // reaches the end, produces just {start_code_, end_code_}.
 bool Encoder::Encode(StringPiece normalized_text,
                      std::vector<int>* encoded_text) const {
   const int text_length = normalized_text.size();
   if (text_length <= 0) {
     *encoded_text = {start_code_, end_code_};
     return true;
   }

   // best[p] holds the best known way of covering the first p positions. A
   // negative `previous_pos` marks a position nothing has reached yet.
   std::vector<SegmentationEntry> best(
       text_length + 1, {/*score=*/0, /*previous_pos=*/-1, /*piece_id=*/-1,
                         /*num_pieces=*/0});

   // The text is shortened by one position per step (including skipped steps),
   // so that it always begins at `start` when the matcher is queried.
   for (int start = 0; start < text_length;
        ++start, normalized_text.RemovePrefix(1)) {
     const SegmentationEntry& origin = best[start];

     // Nothing to extend from a position that was never reached.
     if (start > 0 && origin.previous_pos < 0) {
       continue;
     }

     // Option 1: consume one position as the unknown token, if it exists.
     if (unknown_code_ >= 0) {
       const int next = start + 1;
       const float unknown_penalty = origin.score + unknown_score_;
       SegmentationEntry& target = best[next];
       const bool improves =
           target.previous_pos < 0 || target.score < unknown_penalty;
       if (improves) {
         // A run of unknown tokens is emitted as a single piece: when the
         // origin already ends in the unknown token, keep its back-pointer and
         // piece count instead of starting a new piece.
         const bool extends_unknown = origin.piece_id == unknown_code_;
         target = {
             /*score=*/unknown_penalty,
             /*previous_pos=*/extends_unknown ? origin.previous_pos : start,
             /*piece_id=*/unknown_code_,
             /*num_pieces=*/extends_unknown ? origin.num_pieces
                                            : origin.num_pieces + 1};
       }
     }

     // Option 2: consume any known piece that is a prefix of the remaining
     // text.
     std::vector<TrieMatch> matches;
     const bool matched_ok =
         matcher_->FindAllPrefixMatches(normalized_text, &matches);
     if (!matched_ok) {
       TC3_LOG(ERROR)
           << "Couldn't successfully gather prefix sentence piece matches.";
       return false;
     }
     for (const auto& match : matches) {
       TC3_CHECK(match.id >= 0 && match.id < num_pieces_);
       const int end = start + match.match_length;
       const float candidate_score = origin.score + scores_[match.id];
       SegmentationEntry& target = best[end];
       const bool improves =
           target.previous_pos < 0 || target.score < candidate_score;
       if (improves) {
         target = {/*score=*/candidate_score, /*previous_pos=*/start,
                   /*piece_id=*/match.id + encoding_offset_,
                   /*num_pieces=*/origin.num_pieces + 1};
       }
     }
   }

   // No pieces recorded for the end of the text: emit only the frame codes.
   if (best[text_length].num_pieces <= 0) {
     *encoded_text = {start_code_, end_code_};
     return true;
   }

   // Lay out [start, piece_1 .. piece_k, end] and fill the pieces back to front
   // by following the back-pointers from the end of the text.
   const int num_pieces = best[text_length].num_pieces;
   encoded_text->resize(num_pieces + 2);
   (*encoded_text)[0] = start_code_;
   (*encoded_text)[num_pieces + 1] = end_code_;
   int cursor = text_length;
   int slot = num_pieces;
   while (slot > 0) {
     const SegmentationEntry& step = best[cursor];
     (*encoded_text)[slot] = step.piece_id;
     cursor = step.previous_pos;
     --slot;
   }
   return true;
 }

 }  // namespace libtextclassifier3
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "utils/sentencepiece/encoder.h"

	namespace libtextclassifier3 {

	// Encodes `normalized_text` as a sequence of sentence piece codes.
	//
	// The result written to `encoded_text` always begins with `start_code_` and
	// ends with `end_code_`. Between them are the codes of the highest-scoring
	// segmentation found by a dynamic program over text positions. If
	// `unknown_code_` is non-negative, a position may be covered by the unknown
	// token at a cost of `unknown_score_`; adjacent unknown tokens are merged
	// into a single output code.
	//
	// Returns false if the matcher reports a failure while gathering prefix
	// matches, and true otherwise. For empty text, or when no pieces are recorded
	// for the end of the text, the output is just {start_code_, end_code_}.
	bool Encoder::Encode(StringPiece normalized_text,
	                     std::vector<int>* encoded_text) const {
	  const int len = normalized_text.size();
	  if (len <= 0) {
	    *encoded_text = {start_code_, end_code_};
	    return true;
	  }
	  // We use `previous_pos` to indicate whether a dynamic programming state was
	  // reachable.
	  std::vector<SegmentationEntry> segmentation(
	      len + 1, {/*score=*/0, /*previous_pos=*/-1, /*piece_id=*/-1,
	                /*num_pieces=*/0});
	  for (int i = 0; i < len; i++) {
	    // State couldn't be reached.
	    if (i > 0 && segmentation[i].previous_pos < 0) {
	      // Advance position.
	      normalized_text.RemovePrefix(1);
	      continue;
	    }
	    // Check whether we can use the unknown token.
	    if (unknown_code_ >= 0) {
	      const int pos = i + 1;
	      const float unknown_penalty = segmentation[i].score + unknown_score_;
	      // Take this transition if `pos` was unreachable so far or if it now
	      // gets a strictly higher score.
	      if (segmentation[pos].previous_pos < 0 ||
	          segmentation[pos].score < unknown_penalty) {
	        // Merge multiple unknown tokens into one: when the current state
	        // already ends in the unknown token, inherit its back-pointer and
	        // piece count rather than starting a new piece.
	        if (segmentation[i].piece_id == unknown_code_) {
	          segmentation[pos] = {/*score=*/unknown_penalty,
	                               /*previous_pos=*/segmentation[i].previous_pos,
	                               /*piece_id=*/unknown_code_,
	                               /*num_pieces=*/segmentation[i].num_pieces};
	        } else {
	          segmentation[pos] = {/*score=*/unknown_penalty,
	                               /*previous_pos=*/i,
	                               /*piece_id=*/unknown_code_,
	                               /*num_pieces=*/segmentation[i].num_pieces + 1};
	        }
	      }
	    }
	    // Collect every known piece that is a prefix of the remaining text.
	    std::vector<TrieMatch> matches;
	    if (!matcher_->FindAllPrefixMatches(normalized_text, &matches)) {
	      TC3_LOG(ERROR)
	          << "Couldn't successfully gather prefix sentence piece matches.";
	      return false;
	    }
	    for (const auto& match : matches) {
	      // `match.id` indexes `scores_`, so it has to be a valid piece index.
	      TC3_CHECK(match.id >= 0 && match.id < num_pieces_);
	      const int pos = i + match.match_length;
	      const float candidate_score = segmentation[i].score + scores_[match.id];
	      if (segmentation[pos].previous_pos < 0 ||
	          segmentation[pos].score < candidate_score) {
	        // Emitted codes are the piece index shifted by `encoding_offset_`.
	        segmentation[pos] = {/*score=*/candidate_score, /*previous_pos=*/i,
	                             /*piece_id=*/match.id + encoding_offset_,
	                             /*num_pieces=*/segmentation[i].num_pieces + 1};
	      }
	    }
	    // Advance position.
	    normalized_text.RemovePrefix(1);
	  }
	  // No pieces were recorded for the end of the text.
	  if (segmentation[len].num_pieces <= 0) {
	    *encoded_text = {start_code_, end_code_};
	    return true;
	  }
	  // Reconstruct the chosen pieces back to front by following `previous_pos`
	  // from the end of the text; slots 0 and num_pieces + 1 hold the frame codes.
	  const int num_pieces = segmentation[len].num_pieces;
	  encoded_text->resize(num_pieces + 2);
	  (*encoded_text)[num_pieces + 1] = end_code_;
	  int pos = len;
	  for (int i = num_pieces; i > 0; i--) {
	    (*encoded_text)[i] = segmentation[pos].piece_id;
	    pos = segmentation[pos].previous_pos;
	  }
	  (*encoded_text)[0] = start_code_;
	  return true;
	}

	} // namespace libtextclassifier3