| // |
| // Copyright (C) 2018 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| // Configuration for the text encoder op. |
| |
| // All types declared in this schema live in the `libtextclassifier3` |
| // namespace. |
| namespace libtextclassifier3; |
| |
| // Kind of matcher used for the sentence pieces; this is the type of the |
| // `matcher_type` field of `TextEncoderConfig`. Stored as a single byte. |
| // NOTE(review): presumably each value names the data structure that the |
| // serialized pieces are looked up with -- confirm against the encoder code. |
| // The numeric values are written into serialized configs, so existing |
| // values must not be renumbered; add new values with fresh numbers. |
| enum SentencePieceMatcherType : byte { |
| MAPPED_TRIE = 0, |
| SORTED_STRING_TABLE = 1, |
| } |
| |
| // Configuration for the text encoder: special codes, text normalization |
| // options and the serialized sentence pieces. |
| // |
| // NOTE: the fields carry no explicit `id` attributes, so FlatBuffers assigns |
| // their slots from declaration order. Append new fields at the end; do not |
| // reorder or remove existing ones. |
| table TextEncoderConfig { |
| // Code that is used as the encoding of the start code. Defaults to 0. |
| start_code:int32 = 0; |
| |
| // Code that is used as the encoding of the end code. Defaults to 1. |
| end_code:int32 = 1; |
| |
| // This value is added to all codes so that they do not intersect with |
| // `start_code` and `end_code`. Defaults to 2. |
| encoding_offset:int32 = 2; |
| |
| // Code that is used for out-of-dictionary characters. Defaults to -1. |
| unknown_code:int32 = -1; |
| |
| // Penalty associated with the unknown code. No default is declared, so an |
| // unset field reads back as 0.0. |
| unknown_score:float; |
| |
| // Normalization options. |
| // Serialized normalization charsmap. |
| normalization_charsmap:string; |
| // NOTE(review): not documented in the original schema; presumably the |
| // values that the serialized charsmap above refers to -- confirm against |
| // the code that reads this config. |
| normalization_charsmap_values:string; |
| |
| // Whether to add dummy whitespace at the beginning of the text in order to |
| // treat "world" in "world" and "hello world" uniformly. Defaults to true. |
| add_dummy_prefix:bool = true; |
| |
| // Whether to remove leading, trailing and duplicate internal whitespace. |
| // Defaults to true. |
| remove_extra_whitespaces:bool = true; |
| |
| // Whether to replace whitespace with a meta symbol. Defaults to true. |
| escape_whitespaces:bool = true; |
| |
| // Scores of the sentence pieces. |
| // NOTE(review): presumably one entry per piece, in piece order -- confirm. |
| pieces_scores:[float]; |
| |
| // Serialized sentence pieces. |
| pieces:string; |
| // NOTE(review): not documented in the original schema; presumably the |
| // offset of each piece within `pieces` -- confirm against the reader. |
| pieces_offsets:[int32]; |
| // Matcher type for the sentence pieces. Defaults to MAPPED_TRIE. |
| // NOTE(review): presumably tells the reader how `pieces` is encoded -- |
| // confirm against the encoder code. |
| matcher_type: SentencePieceMatcherType = MAPPED_TRIE; |
| } |