| // |
| // Copyright (C) 2018 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| |
| include "annotator/entity-data.fbs"; |
| include "annotator/experimental/experimental.fbs"; |
| include "utils/codepoint-range.fbs"; |
| include "utils/container/bit-vector.fbs"; |
| include "utils/flatbuffers/flatbuffers.fbs"; |
| include "utils/grammar/rules.fbs"; |
| include "utils/intents/intent-config.fbs"; |
| include "utils/normalization.fbs"; |
| include "utils/resources.fbs"; |
| include "utils/tokenizer.fbs"; |
| include "utils/zlib/buffer.fbs"; |
| |
| // File identifier for buffers of this schema. FlatBuffers identifiers must be |
| // exactly 4 characters long, so the trailing space is significant. |
| file_identifier "TC2 "; |
| |
| // The possible model modes, represented as a bit field: ANNOTATION (1), |
| // CLASSIFICATION (2) and SELECTION (4) are the individual bits; the remaining |
| // values name their bitwise combinations (e.g. 3 = 1 | 2, ALL = 1 | 2 | 4). |
| namespace libtextclassifier3; |
| enum ModeFlag : int { |
| NONE = 0, |
| ANNOTATION = 1, |
| CLASSIFICATION = 2, |
| ANNOTATION_AND_CLASSIFICATION = 3, |
| SELECTION = 4, |
| ANNOTATION_AND_SELECTION = 5, |
| CLASSIFICATION_AND_SELECTION = 6, |
| ALL = 7, |
| } |
| |
| // Enum for specifying the annotation usecase. The *_annotation_usecases |
| // fields elsewhere in this schema are documented as flag fields over these |
| // values. |
| namespace libtextclassifier3; |
| enum AnnotationUsecase : int { |
| // Results are optimized for Smart{Select,Share,Linkify}. |
| ANNOTATION_USECASE_SMART = 0, |
| |
| // Results are optimized for using TextClassifier as an infrastructure that |
| // annotates as much as possible. |
| ANNOTATION_USECASE_RAW = 1, |
| } |
| |
| // The kinds of datetime extractors. Used as the type of |
| // DatetimeModelExtractor.extractor. |
| namespace libtextclassifier3; |
| enum DatetimeExtractorType : int { |
| UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0, |
| AM = 1, |
| PM = 2, |
| JANUARY = 3, |
| FEBRUARY = 4, |
| MARCH = 5, |
| APRIL = 6, |
| MAY = 7, |
| JUNE = 8, |
| JULY = 9, |
| AUGUST = 10, |
| SEPTEMBER = 11, |
| OCTOBER = 12, |
| NOVEMBER = 13, |
| DECEMBER = 14, |
| NEXT = 15, |
| NEXT_OR_SAME = 16, |
| LAST = 17, |
| NOW = 18, |
| TOMORROW = 19, |
| YESTERDAY = 20, |
| PAST = 21, |
| FUTURE = 22, |
| DAY = 23, |
| WEEK = 24, |
| MONTH = 25, |
| YEAR = 26, |
| MONDAY = 27, |
| TUESDAY = 28, |
| WEDNESDAY = 29, |
| THURSDAY = 30, |
| FRIDAY = 31, |
| SATURDAY = 32, |
| SUNDAY = 33, |
| DAYS = 34, |
| WEEKS = 35, |
| MONTHS = 36, |
| |
| // TODO(zilka): Make the following 3 values singular for consistency. |
| // NOTE(review): as placed, "the following 3 values" are HOURS, MINUTES and |
| // SECONDS, although DAYS, WEEKS, MONTHS and YEARS are plural too - confirm |
| // which values the TODO is meant to cover. |
| HOURS = 37, |
| |
| MINUTES = 38, |
| SECONDS = 39, |
| YEARS = 40, |
| DIGITS = 41, |
| SIGNEDDIGITS = 42, |
| ZERO = 43, |
| ONE = 44, |
| TWO = 45, |
| THREE = 46, |
| FOUR = 47, |
| FIVE = 48, |
| SIX = 49, |
| SEVEN = 50, |
| EIGHT = 51, |
| NINE = 52, |
| TEN = 53, |
| ELEVEN = 54, |
| TWELVE = 55, |
| THIRTEEN = 56, |
| FOURTEEN = 57, |
| FIFTEEN = 58, |
| SIXTEEN = 59, |
| SEVENTEEN = 60, |
| EIGHTEEN = 61, |
| NINETEEN = 62, |
| TWENTY = 63, |
| THIRTY = 64, |
| FORTY = 65, |
| FIFTY = 66, |
| SIXTY = 67, |
| SEVENTY = 68, |
| EIGHTY = 69, |
| NINETY = 70, |
| HUNDRED = 71, |
| THOUSAND = 72, |
| NOON = 73, |
| MIDNIGHT = 74, |
| } |
| |
| // The type of a capturing group in a datetime regex. Used as the element type |
| // of DatetimeModelPattern_.Regex.groups, where it decides how the matched |
| // content is parsed. |
| namespace libtextclassifier3; |
| enum DatetimeGroupType : int { |
| GROUP_UNKNOWN = 0, |
| GROUP_UNUSED = 1, |
| GROUP_YEAR = 2, |
| GROUP_MONTH = 3, |
| GROUP_DAY = 4, |
| GROUP_HOUR = 5, |
| GROUP_MINUTE = 6, |
| GROUP_SECOND = 7, |
| GROUP_AMPM = 8, |
| GROUP_RELATIONDISTANCE = 9, |
| GROUP_RELATION = 10, |
| GROUP_RELATIONTYPE = 11, |
| |
| // Dummy groups serve just as an inflator of the selection. E.g. we might want |
| // to select more text than was contained in an envelope of all extractor |
| // spans. |
| GROUP_DUMMY1 = 12, |
| |
| GROUP_DUMMY2 = 13, |
| GROUP_ABSOLUTETIME = 14, |
| } |
| |
| // Options for the model that predicts text selection. |
| namespace libtextclassifier3; |
| table SelectionModelOptions { |
| // If true, before the selection is returned, the unpaired brackets contained |
| // in the predicted selection are stripped from both ends of the selection. |
| // The bracket codepoints are defined in the Unicode standard: |
| // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt |
| strip_unpaired_brackets:bool = true; |
| |
| // Number of hypothetical click positions on either side of the actual click |
| // to consider in order to enforce symmetry. No explicit default is given, so |
| // an absent field reads as 0. |
| symmetry_context_size:int; |
| |
| // Number of examples to bundle in one batch for inference. |
| batch_size:int = 1024; |
| |
| // Whether to always classify a suggested selection or only on demand. |
| always_classify_suggested_selection:bool = false; |
| } |
| |
| // Options for the model that classifies a text selection. |
| namespace libtextclassifier3; |
| table ClassificationModelOptions { |
| // Limits for phone numbers (minimum and maximum number of digits). |
| phone_min_num_digits:int = 7; |
| |
| phone_max_num_digits:int = 15; |
| |
| // Limits for addresses (minimum number of tokens). No explicit default is |
| // given, so an absent field reads as 0. |
| address_min_num_tokens:int; |
| |
| // Maximum number of tokens to attempt a classification (-1 is unlimited). |
| max_num_tokens:int = -1; |
| } |
| |
| // Options for post-checks, checksums and verification to apply on a match. |
| namespace libtextclassifier3; |
| table VerificationOptions { |
| // NOTE(review): presumably enables a Luhn checksum check of the match - |
| // confirm against the implementation. |
| verify_luhn_checksum:bool = false; |
| |
| // Lua verifier to use. |
| // Index of the lua verifier in the model. |
| // NOTE(review): presumably an index into RegexModel.lua_verifier, with the |
| // default -1 meaning that no verifier is used - confirm. |
| lua_verifier:int = -1; |
| } |
| |
| // Behaviour of rule capturing groups. |
| // This specifies how the text and span of a capturing group, in a regular |
| // expression or from a capturing match in a grammar rule, should be handled. |
| namespace libtextclassifier3; |
| table CapturingGroup { |
| // If true, the span of the capturing group will be used to |
| // extend the selection. |
| extend_selection:bool = true; |
| |
| // If set, the text of the capturing group will be used to set a field in |
| // the classification result entity data. |
| entity_field_path:FlatbufferFieldPath; |
| |
| // If set, the flatbuffer entity data will be merged with the |
| // classification result entity data. |
| serialized_entity_data:string (shared); |
| |
| // If set, normalization to apply before text is used in entity data. |
| normalization_options:NormalizationOptions; |
| |
| // NOTE(review): presumably the structured (non-serialized) counterpart of |
| // serialized_entity_data - confirm how the two interact. |
| entity_data:EntityData; |
| } |
| |
| // A regular expression matcher to check; the element type of |
| // RegexModel.patterns. |
| namespace libtextclassifier3.RegexModel_; |
| table Pattern { |
| // The name of the collection of a match. |
| collection_name:string (shared); |
| |
| // The pattern to check. |
| pattern:string (shared); |
| |
| // The modes for which to apply the patterns. |
| enabled_modes:ModeFlag = ALL; |
| |
| // The final score to assign to the results of this pattern. |
| target_classification_score:float = 1; |
| |
| // Priority score used for conflict resolution with the other models. |
| priority_score:float = 0; |
| |
| // If true, will use an approximate matching implementation implemented |
| // using Find() instead of the true Match(). This approximate matching will |
| // use the first Find() result and then check that it spans the whole input. |
| use_approximate_matching:bool = false; |
| |
| // NOTE(review): presumably a compressed form of `pattern` - confirm which of |
| // the two takes precedence when both are set. |
| compressed_pattern:CompressedBuffer; |
| |
| // Verification to apply on a match. |
| verification_options:VerificationOptions; |
| |
| // Behaviour of the capturing groups of the pattern (see CapturingGroup). |
| capturing_group:[CapturingGroup]; |
| |
| // Entity data to set for a match. |
| serialized_entity_data:string (shared); |
| |
| entity_data:EntityData; |
| } |
| |
| // The regular expression model: the patterns to check plus the Lua scripts |
| // available for verifying matches. |
| namespace libtextclassifier3; |
| table RegexModel { |
| // List of regular expression matchers to check. |
| patterns:[RegexModel_.Pattern]; |
| |
| // If true, will compile the regexes only on first use. |
| lazy_regex_compilation:bool = true; |
| |
| // Lua scripts for match verification. |
| // The verifier can access: |
| // * `context`: The context as a string. |
| // * `match`: The groups of the regex match as an array, each group gives |
| // * `begin`: span start |
| // * `end`: span end |
| // * `text`: the text |
| // The verifier is expected to return a boolean, indicating whether the |
| // verification succeeded or not. |
| lua_verifier:[string]; |
| } |
| |
| // A single regex pattern; the element type of DatetimeModelPattern.regexes. |
| namespace libtextclassifier3.DatetimeModelPattern_; |
| table Regex { |
| // The regex pattern. |
| pattern:string (shared); |
| |
| // The ith entry specifies the type of the ith capturing group. |
| // This is used to decide how the matched content has to be parsed. |
| groups:[DatetimeGroupType]; |
| |
| // NOTE(review): presumably a compressed form of `pattern` - confirm. |
| compressed_pattern:CompressedBuffer; |
| } |
| |
| // A group of datetime regexes sharing locales, scores and enabled modes; the |
| // element type of DatetimeModel.patterns. |
| namespace libtextclassifier3; |
| table DatetimeModelPattern { |
| // List of regex patterns. |
| regexes:[DatetimeModelPattern_.Regex]; |
| |
| // List of locale indices in DatetimeModel that represent the locales that |
| // these patterns should be used for. If empty, can be used for all locales. |
| locales:[int]; |
| |
| // The final score to assign to the results of this pattern. |
| target_classification_score:float = 1; |
| |
| // Priority score used for conflict resolution with the other models. |
| priority_score:float = 0; |
| |
| // The modes for which to apply the patterns. |
| enabled_modes:ModeFlag = ALL; |
| |
| // The annotation usecases for which to apply the patterns. |
| // This is a flag field for values of AnnotationUsecase. |
| // The default 4294967295 is 0xFFFFFFFF, i.e. all 32 bits are set. |
| enabled_annotation_usecases:uint = 4294967295; |
| } |
| |
| // A datetime extractor: associates an extractor type with a pattern. The |
| // element type of DatetimeModel.extractors. |
| namespace libtextclassifier3; |
| table DatetimeModelExtractor { |
| // The type of this extractor. |
| extractor:DatetimeExtractorType; |
| pattern:string (shared); |
| // NOTE(review): presumably indices into DatetimeModel.locales, as for |
| // DatetimeModelPattern.locales - confirm. |
| locales:[int]; |
| // NOTE(review): presumably a compressed form of `pattern` - confirm. |
| compressed_pattern:CompressedBuffer; |
| } |
| |
| // The datetime model: supported locales, patterns, extractors and the options |
| // that control how they are applied. |
| namespace libtextclassifier3; |
| table DatetimeModel { |
| // List of BCP 47 locale strings representing all locales supported by the |
| // model. The individual patterns refer back to them using an index. |
| locales:[string]; |
| |
| patterns:[DatetimeModelPattern]; |
| extractors:[DatetimeModelExtractor]; |
| |
| // If true, will use the extractors for determining the match location as |
| // opposed to using the location where the global pattern matched. |
| use_extractors_for_locating:bool = true; |
| |
| // List of locale ids whose rules are always run, after the requested |
| // ones. |
| default_locales:[int]; |
| |
| // If true, will generate the alternative interpretations for ambiguous |
| // datetime expressions. |
| generate_alternative_interpretations_when_ambiguous:bool = false; |
| |
| // If true, will compile the regexes only on first use. |
| lazy_regex_compilation:bool = true; |
| |
| // If true, will give only future dates (when the day is not specified). |
| prefer_future_for_unspecified_date:bool = false; |
| } |
| |
| // Configuration for the tokenizer. Used as the type of |
| // GrammarModel.tokenizer_options. |
| namespace libtextclassifier3; |
| table GrammarTokenizerOptions { |
| // The tokenizer to use. |
| tokenization_type:TokenizationType = ICU; |
| |
| // If true, white space tokens will be kept when using the icu tokenizer. |
| icu_preserve_whitespace_tokens:bool = false; |
| |
| // Codepoint ranges that determine what role the different codepoints play |
| // during tokenization. The ranges must not overlap. |
| tokenization_codepoint_config:[TokenizationCodepointRange]; |
| |
| // A set of codepoint ranges to use in the mixed tokenization mode to identify |
| // stretches of tokens to re-tokenize using the internal tokenizer. |
| internal_tokenizer_codepoint_ranges:[CodepointRange]; |
| |
| // If true, tokens will be also split when the codepoint's script_id changes |
| // as defined in TokenizationCodepointRange. |
| tokenize_on_script_change:bool = false; |
| } |
| |
| // A named datetime model; the element type of DatetimeModelLibrary.models. |
| namespace libtextclassifier3.DatetimeModelLibrary_; |
| table Item { |
| // The name of the model. |
| key:string (shared); |
| // The model itself. |
| value:DatetimeModel; |
| } |
| |
| // A set of named DateTime models. |
| namespace libtextclassifier3; |
| table DatetimeModelLibrary { |
| // The (name, model) entries of the library. |
| models:[DatetimeModelLibrary_.Item]; |
| } |
| |
| // Classification result to instantiate for a rule match. The element type of |
| // GrammarModel.rule_classification_result. |
| namespace libtextclassifier3.GrammarModel_; |
| table RuleClassificationResult { |
| // The name of the collection. |
| collection_name:string (shared); |
| |
| // The score. |
| target_classification_score:float = 1; |
| |
| // The priority score used for conflict resolution with the other models. |
| priority_score:float = 0; |
| |
| // Behaviour of capturing matches. |
| capturing_group:[CapturingGroup]; |
| |
| // Entity data to set for a match. |
| serialized_entity_data:string (shared); |
| |
| // Enabled modes. |
| enabled_modes:ModeFlag = ALL; |
| |
| // NOTE(review): presumably the structured (non-serialized) counterpart of |
| // serialized_entity_data - confirm. |
| entity_data:EntityData; |
| } |
| |
| // Configuration for grammar based annotators. |
| namespace libtextclassifier3; |
| table GrammarModel { |
| // The grammar rules. |
| rules:grammar.RulesSet; |
| |
| // Classification results to instantiate for rule matches. |
| rule_classification_result:[GrammarModel_.RuleClassificationResult]; |
| |
| // Number of tokens in the context to use for classification and text |
| // selection suggestion. |
| // A value -1 uses the full context. |
| // Applies to both the left and the right context fields below. Neither has |
| // an explicit default, so an absent field reads as 0. |
| context_left_num_tokens:int; |
| |
| context_right_num_tokens:int; |
| |
| // Grammar specific tokenizer options. |
| tokenizer_options:GrammarTokenizerOptions; |
| |
| // The score. |
| target_classification_score:float = 1; |
| |
| // The priority score used for conflict resolution with the other models. |
| priority_score:float = 1; |
| } |
| |
| // An entry of MoneyParsingOptions.quantities_name_to_exponent, mapping a |
| // quantity string to a power of 10. |
| namespace libtextclassifier3.MoneyParsingOptions_; |
| table QuantitiesNameToExponentEntry { |
| // The quantity string (e.g. "million"). Declared as the table's `key`. |
| key:string (key, shared); |
| // The power of 10 (e.g. 6 in case of "million"). |
| value:int; |
| } |
| |
| // Options for parsing money amounts. Used as the type of |
| // Model.money_parsing_options. |
| namespace libtextclassifier3; |
| table MoneyParsingOptions { |
| // Separators (codepoints) marking decimal or thousand in the money amount. |
| separators:[int]; |
| |
| // Mapping between a quantity string (e.g. "million") and the power of 10 |
| // it multiplies the amount with (e.g. 6 in case of "million"). |
| // NOTE: The entries need to be sorted by key since we use LookupByKey. |
| quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry]; |
| } |
| |
| // An entry of ModelTriggeringOptions.collection_to_priority, mapping a |
| // collection type to a priority factor. |
| namespace libtextclassifier3.ModelTriggeringOptions_; |
| table CollectionToPriorityEntry { |
| // The collection type (e.g. "address", "phone"). Declared as the table's |
| // `key`. |
| key:string (key, shared); |
| // The factor to apply to the priority score. |
| value:float; |
| } |
| |
| // Options controlling the output of the Tensorflow Lite models. |
| namespace libtextclassifier3; |
| table ModelTriggeringOptions { |
| // Lower bound threshold for filtering annotation model outputs. |
| min_annotate_confidence:float = 0; |
| |
| // The modes for which to enable the models. |
| enabled_modes:ModeFlag = ALL; |
| |
| // Comma-separated list of locales (BCP 47 tags) that dictionary |
| // classification supports. |
| dictionary_locales:string (shared); |
| |
| // Comma-separated list of locales (BCP 47 tags) that the model supports, that |
| // are used to prevent triggering on input in unsupported languages. If |
| // empty, the model will trigger on all inputs. |
| locales:string (shared); |
| |
| // Priority score assigned to the "other" class from ML model. |
| other_collection_priority_score:float = -1000; |
| |
| // Priority score assigned to knowledge engine annotations. |
| knowledge_priority_score:float = 0; |
| // Deprecated field. It must stay declared here: removing it would shift the |
| // positions of the fields that follow. |
| reserved_7:int16 (deprecated); |
| |
| // Apply a factor to the priority score for entities that are added to this |
| // map. Key: collection type e.g. "address", "phone"..., Value: float number. |
| // NOTE: The entries here need to be sorted since we use LookupByKey. |
| collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry]; |
| } |
| |
| // Options controlling the output of the classifier. |
| namespace libtextclassifier3; |
| table OutputOptions { |
| // Lists of collection names that will be filtered out at the output: |
| // - For annotation, the spans of given collection are simply dropped. |
| // - For classification, the result is mapped to the class "other". |
| // - For selection, the spans of given class are returned as |
| // single-selection. |
| // One list per mode follows: annotation, classification, selection. |
| filtered_collections_annotation:[string]; |
| |
| filtered_collections_classification:[string]; |
| filtered_collections_selection:[string]; |
| } |
| |
| // Pruning mask configuration for the hashed-charactergram embeddings. Used as |
| // the type of Model.embedding_pruning_mask. |
| namespace libtextclassifier3.Model_; |
| table EmbeddingPruningMask { |
| // If true, use pruning mask. In this case, we use mask |
| // pruning_mask to determine the mapping of hashed-charactergrams. |
| enabled:bool; |
| |
| // Packing of the binary pruning mask into uint64 values. |
| // The vector is declared with force_align: 16. |
| pruning_mask:[ulong] (force_align: 16); |
| |
| // Number of buckets before pruning. |
| full_num_buckets:int; |
| |
| // Index of row of compressed embedding matrix to which all pruned buckets |
| // are mapped. |
| pruned_row_bucket_id:int; |
| } |
| |
| // Options for conflict resolution between annotations. Used as the type of |
| // Model.conflict_resolution_options. |
| namespace libtextclassifier3.Model_; |
| table ConflictResolutionOptions { |
| // If true, will prioritize the longest annotation during conflict |
| // resolution. |
| prioritize_longest_annotation:bool = false; |
| |
| // If true, the annotator will perform conflict resolution between the |
| // different sub-annotators also in the RAW mode. If false, no conflict |
| // resolution will be performed in RAW mode. |
| do_conflict_resolution_in_raw_mode:bool = true; |
| } |
| |
| // The top-level model table: bundles the Tensorflow Lite models, the options |
| // of the individual sub-models/annotators and the global output settings. |
| // NOTE(review): presumably the root type of buffers with the "TC2 " file |
| // identifier; no root_type declaration is visible in this part of the file - |
| // confirm. |
| namespace libtextclassifier3; |
| table Model { |
| // Comma-separated list of locales supported by the model as BCP 47 tags. |
| locales:string (shared); |
| |
| // The version of the model. |
| version:int; |
| |
| // A name for the model that can be used for e.g. logging. |
| name:string (shared); |
| |
| // Feature processor options, one set each for selection and classification. |
| selection_feature_options:FeatureProcessorOptions; |
| classification_feature_options:FeatureProcessorOptions; |
| |
| // Tensorflow Lite models. |
| // Each vector is declared with force_align: 16. |
| selection_model:[ubyte] (force_align: 16); |
| |
| classification_model:[ubyte] (force_align: 16); |
| embedding_model:[ubyte] (force_align: 16); |
| |
| // Options for the different models. |
| selection_options:SelectionModelOptions; |
| |
| classification_options:ClassificationModelOptions; |
| regex_model:RegexModel; |
| datetime_model:DatetimeModel; |
| |
| // Options controlling the output of the models. |
| triggering_options:ModelTriggeringOptions; |
| |
| // Global switch that controls if SuggestSelection(), ClassifyText() and |
| // Annotate() will run. If a mode is disabled it returns empty/no-op results. |
| enabled_modes:ModeFlag = ALL; |
| |
| // If true, will snap the selections that consist only of whitespaces to the |
| // containing suggested span. Otherwise, no suggestion is proposed, since the |
| // selections are not part of any token. |
| snap_whitespace_selections:bool = true; |
| |
| // Global configuration for the output of SuggestSelection(), ClassifyText() |
| // and Annotate(). |
| output_options:OutputOptions; |
| |
| // Configures how Intents should be generated on Android. |
| android_intent_options:AndroidIntentFactoryOptions; |
| |
| intent_options:IntentFactoryModel; |
| |
| // Model resources. |
| resources:ResourcePool; |
| |
| // Schema data for handling entity data. |
| entity_data_schema:[ubyte]; |
| |
| // Options of the number and duration annotators. |
| number_annotator_options:NumberAnnotatorOptions; |
| duration_annotator_options:DurationAnnotatorOptions; |
| |
| // Comma-separated list of locales (BCP 47 tags) that the model supports, that |
| // are used to prevent triggering on input in unsupported languages. If |
| // empty, the model will trigger on all inputs. |
| triggering_locales:string (shared); |
| |
| embedding_pruning_mask:Model_.EmbeddingPruningMask; |
| // Deprecated field. It must stay declared here: removing it would shift the |
| // positions of the fields that follow. |
| reserved_25:int16 (deprecated); |
| // Options and models of the remaining annotators. |
| contact_annotator_options:ContactAnnotatorOptions; |
| money_parsing_options:MoneyParsingOptions; |
| translate_annotator_options:TranslateAnnotatorOptions; |
| grammar_model:GrammarModel; |
| conflict_resolution_options:Model_.ConflictResolutionOptions; |
| experimental_model:ExperimentalModel; |
| pod_ner_model:PodNerModel; |
| vocab_model:VocabModel; |
| // NOTE(review): a second GrammarModel, presumably used for datetime |
| // annotation - confirm how it relates to datetime_model. |
| datetime_grammar_model:GrammarModel; |
| } |
| |
| // Method for selecting the center token. |
| namespace libtextclassifier3.FeatureProcessorOptions_; |
| enum CenterTokenSelectionMethod : int { |
| // Invalid option. |
| DEFAULT_CENTER_TOKEN_METHOD = 0, |
| |
| // Use click indices to determine the center token. |
| CENTER_TOKEN_FROM_CLICK = 1, |
| |
| // Use selection indices to get a token range, and select the middle of it |
| // as the center token. |
| CENTER_TOKEN_MIDDLE_OF_SELECTION = 2, |
| } |
| |
| // Bounds-sensitive feature extraction configuration. |
| // None of the fields has an explicit default, so absent fields read as |
| // false / 0. |
| namespace libtextclassifier3.FeatureProcessorOptions_; |
| table BoundsSensitiveFeatures { |
| // Enables the extraction of bounds-sensitive features, instead of the click |
| // context features. |
| enabled:bool; |
| |
| // The numbers of tokens to extract in specific locations relative to the |
| // bounds (this and the following three fields). |
| // Immediately before the span. |
| num_tokens_before:int; |
| |
| // Inside the span, aligned with the beginning. |
| num_tokens_inside_left:int; |
| |
| // Inside the span, aligned with the end. |
| num_tokens_inside_right:int; |
| |
| // Immediately after the span. |
| num_tokens_after:int; |
| |
| // If true, also extracts the tokens of the entire span and adds up their |
| // features forming one "token" to include in the extracted features. |
| include_inside_bag:bool; |
| |
| // If true, includes the selection length (in the number of tokens) as a |
| // feature. |
| include_inside_length:bool; |
| |
| // If true, for selection, single token spans are not run through the model |
| // and their score is assumed to be zero. |
| score_single_token_spans_as_zero:bool; |
| } |
| |
| // Options of the feature processor. Used by Model for both the selection and |
| // the classification feature options. |
| namespace libtextclassifier3; |
| table FeatureProcessorOptions { |
| // Number of buckets used for hashing charactergrams. |
| num_buckets:int = -1; |
| |
| // Size of the embedding. |
| embedding_size:int = -1; |
| |
| // Number of bits for quantization for embeddings. |
| embedding_quantization_bits:int = 8; |
| |
| // Context size defines the number of words to the left and to the right of |
| // the selected word to be used as context. For example, if context size is |
| // N, then we take N words to the left and N words to the right of the |
| // selected word as its context. |
| context_size:int = -1; |
| |
| // Maximum number of words of the context to select in total. |
| max_selection_span:int = -1; |
| |
| // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3 |
| // character trigrams etc. |
| chargram_orders:[int]; |
| |
| // Maximum length of a word, in codepoints. |
| max_word_length:int = 20; |
| |
| // If true, will use the unicode-aware functionality for extracting features. |
| unicode_aware_features:bool = false; |
| |
| // Whether to extract the token case feature. |
| extract_case_feature:bool = false; |
| |
| // Whether to extract the selection mask feature. |
| extract_selection_mask_feature:bool = false; |
| |
| // List of regexps to run over each token. For each regexp, if there is a |
| // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used. |
| regexp_feature:[string]; |
| |
| // Whether to remap all digits to a single number. |
| remap_digits:bool = false; |
| |
| // Whether to lower-case each token before generating hashgrams. |
| // No explicit default is given, so an absent field reads as false. |
| lowercase_tokens:bool; |
| |
| // If true, the selection classifier output will contain only the selections |
| // that are feasible (e.g., those that are shorter than max_selection_span), |
| // if false, the output will be a complete cross-product of possible |
| // selections to the left and possible selections to the right, including the |
| // infeasible ones. |
| // NOTE: Exists mainly for compatibility with older models that were trained |
| // with the non-reduced output space. |
| selection_reduced_output_space:bool = true; |
| |
| // Collection names. |
| collections:[string]; |
| |
| // An index of collection in collections to be used if a collection name can't |
| // be mapped to an id. |
| default_collection:int = -1; |
| |
| // If true, will split the input by lines, and only use the line that contains |
| // the clicked token. |
| only_use_line_with_click:bool = false; |
| |
| // If true, will split tokens that contain the selection boundary, at the |
| // position of the boundary. |
| // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com" |
| split_tokens_on_selection_boundaries:bool = false; |
| |
| // Codepoint ranges that determine how different codepoints are tokenized. |
| // The ranges must not overlap. |
| tokenization_codepoint_config:[TokenizationCodepointRange]; |
| |
| // Method for selecting the center token. No explicit default is given, so an |
| // absent field reads as DEFAULT_CENTER_TOKEN_METHOD (0). |
| center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod; |
| |
| // If true, span boundaries will be snapped to containing tokens and not |
| // required to exactly match token boundaries. |
| snap_label_span_boundaries_to_containing_tokens:bool; |
| |
| // A set of codepoint ranges supported by the model. |
| supported_codepoint_ranges:[CodepointRange]; |
| |
| // A set of codepoint ranges to use in the mixed tokenization mode to identify |
| // stretches of tokens to re-tokenize using the internal tokenizer. |
| internal_tokenizer_codepoint_ranges:[CodepointRange]; |
| |
| // Minimum ratio of supported codepoints in the input context. If the ratio |
| // is lower than this, the feature computation will fail. |
| min_supported_codepoint_ratio:float = 0; |
| |
| // Used for versioning the format of features the model expects. |
| // - feature_version == 0: |
| // For each token the features consist of: |
| // - chargram embeddings |
| // - dense features |
| // Chargram embeddings for tokens are concatenated first together, |
| // and at the end, the dense features for the tokens are concatenated |
| // to it. So the resulting feature vector has two regions. |
| feature_version:int = 0; |
| |
| // The tokenizer to use, and whether white space tokens are kept when using |
| // the icu tokenizer. |
| tokenization_type:TokenizationType = INTERNAL_TOKENIZER; |
| icu_preserve_whitespace_tokens:bool = false; |
| |
| // List of codepoints that will be stripped from beginning and end of |
| // predicted spans. |
| ignored_span_boundary_codepoints:[int]; |
| |
| // Bounds-sensitive feature extraction configuration. |
| bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures; |
| |
| // List of allowed charactergrams. The extracted charactergrams are filtered |
| // using this list, and charactergrams that are not present are interpreted as |
| // out-of-vocabulary. |
| // If no allowed_chargrams are specified, all charactergrams are allowed. |
| // The field is typed as bytes type to allow non-UTF8 chargrams. |
| // NOTE(review): the sentence above does not match this schema, where the |
| // field is declared as [string] - confirm whether non-UTF8 chargrams are |
| // still expected here. |
| allowed_chargrams:[string]; |
| |
| // If true, tokens will be also split when the codepoint's script_id changes |
| // as defined in TokenizationCodepointRange. |
| tokenize_on_script_change:bool = false; |
| |
| // If true, the pipe character '|' will be used as a newline character when |
| // splitting lines. |
| use_pipe_character_for_newline:bool = true; |
| } |
| |
| // Options of the number (and percentage) annotator. Used as the type of |
| // Model.number_annotator_options. |
| namespace libtextclassifier3; |
| table NumberAnnotatorOptions { |
| // If true, number and percentage annotations will be produced. |
| enabled:bool = false; |
| |
| // Score to assign to the annotated numbers and percentages in the annotator. |
| score:float = 1; |
| |
| // Number priority score used for conflict resolution with the other models. |
| priority_score:float = 0; |
| |
| // The modes in which to enable number and percentage annotations. |
| enabled_modes:ModeFlag = ALL; |
| |
| // The annotation usecases for which to produce number annotations. |
| // This is a flag field for values of AnnotationUsecase. |
| // The default 4294967295 is 0xFFFFFFFF, i.e. all 32 bits are set. |
| enabled_annotation_usecases:uint = 4294967295; |
| |
| // [Deprecated] A list of codepoints that can form a prefix of a valid number. |
| allowed_prefix_codepoints:[int]; |
| |
| // [Deprecated] A list of codepoints that can form a suffix of a valid number. |
| allowed_suffix_codepoints:[int]; |
| |
| // [Deprecated] List of codepoints that will be stripped from beginning of |
| // predicted spans. |
| ignored_prefix_span_boundary_codepoints:[int]; |
| |
| // [Deprecated] List of codepoints that will be stripped from end of predicted |
| // spans. |
| ignored_suffix_span_boundary_codepoints:[int]; |
| |
| // [Deprecated] If true, percent annotations will be produced. |
| enable_percentage:bool = false; |
| |
| // Zero separated and ordered list of suffixes that mark a percent. |
| percentage_pieces_string:string (shared); |
| |
| // [Deprecated] List of suffixes offsets in the percentage_pieces_string |
| // string. |
| percentage_pieces_offsets:[int]; |
| |
| // Priority score for the percentage annotation. |
| percentage_priority_score:float = 1; |
| |
| // Float number priority score used for conflict resolution with the other |
| // models. |
| float_number_priority_score:float = 0; |
| |
| // The maximum number of digits an annotated number can have. Requirement: |
| // the value should be less than or equal to 20. |
| max_number_of_digits:int = 20; |
| |
| // The annotation usecases for which to produce percentage annotations. |
| // This is a flag field for values of AnnotationUsecase. |
| // NOTE(review): the default 2 has only bit 1 set - presumably |
| // (1 << ANNOTATION_USECASE_RAW); confirm the bit encoding. |
| percentage_annotation_usecases:uint = 2; |
| } |
| |
| // DurationAnnotator is so far tailored for English and Japanese only. |
| namespace libtextclassifier3; |
| table DurationAnnotatorOptions { |
| // If true, duration annotations will be produced. |
| enabled:bool = false; |
| |
| // Score to assign to the annotated durations from the annotator. |
| score:float = 1; |
| |
| // Priority score used for conflict resolution with the other models. |
| priority_score:float = 0; |
| |
| // The modes in which to enable duration annotations. |
| enabled_modes:ModeFlag = ALL; |
| |
| // The annotation usecases for which to produce duration annotations. |
| // The default 4294967295 is 0xFFFFFFFF, i.e. all 32 bits are set. |
| enabled_annotation_usecases:uint = 4294967295; |
| |
| // Durations typically look like XX hours and XX minutes etc... The lists of |
| // strings below enumerate variants of "hours", "minutes", etc. in these |
| // expressions. These are verbatim strings that are matched against tokens in |
| // the input. |
| week_expressions:[string]; |
| |
| day_expressions:[string]; |
| hour_expressions:[string]; |
| minute_expressions:[string]; |
| second_expressions:[string]; |
| |
| // List of expressions that don't break a duration expression (they can |
| // become a part of it) but have no semantic meaning. |
| filler_expressions:[string]; |
| |
| // List of expressions that mean half of a unit of duration (e.g. "half an |
| // hour"). |
| half_expressions:[string]; |
| |
| // Set of codepoints that can split the Annotator tokens to sub-tokens for |
| // sub-token matching. |
| sub_token_separator_codepoints:[int]; |
| |
| // If this is true, unit must be associated with quantity. For example, a |
| // phrase "minute" is not parsed as one minute duration if this is true. |
| // No explicit default is given, so an absent field reads as false. |
| require_quantity:bool; |
| |
| // If this is true, dangling quantity is included in the annotation. For |
| // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds. |
| enable_dangling_quantity_interpretation:bool = true; |
| } |
| |
| // Options of the contact annotator. Used as the type of |
| // Model.contact_annotator_options. |
| namespace libtextclassifier3; |
| table ContactAnnotatorOptions { |
| // Supported for English genitives only so far. |
| // No explicit default is given, so an absent field reads as false. |
| enable_declension:bool; |
| |
| // For each language there is a customized list of supported declensions. |
| language:string (shared); |
| } |
| |
| // The algorithm used by the translate annotator. Used as the type of |
| // TranslateAnnotatorOptions.algorithm. |
| namespace libtextclassifier3.TranslateAnnotatorOptions_; |
| enum Algorithm : int { |
| DEFAULT_ALGORITHM = 0, |
| // NOTE(review): presumably configured through |
| // TranslateAnnotatorOptions.backoff_options - confirm. |
| BACKOFF = 1, |
| } |
| |
| // Options of the backoff algorithm. |
| // Backoff is the algorithm shipped with Android Q. |
| namespace libtextclassifier3.TranslateAnnotatorOptions_; |
| table BackoffOptions { |
| // The minimum size of text to prefer for detection (in codepoints). |
| min_text_size:int = 20; |
| |
| // For reducing the score when text is less than the preferred size. |
| penalize_ratio:float = 1; |
| |
| // Original detection score to surrounding text detection score ratios. |
| subject_text_score_ratio:float = 0.4; |
| } |
| |
| // Options of the translate annotator. Used as the type of |
| // Model.translate_annotator_options. |
| namespace libtextclassifier3; |
| table TranslateAnnotatorOptions { |
| // NOTE(review): presumably enables the translate annotator, as the `enabled` |
| // fields of the other annotator options do - confirm. |
| enabled:bool = false; |
| |
| // Score to assign to the classification results. |
| score:float = 1; |
| |
| // Priority score used for conflict resolution with the other models. |
| // No explicit default is given, so an absent field reads as 0. |
| priority_score:float; |
| |
| // The algorithm to use. No explicit default is given, so an absent field |
| // reads as DEFAULT_ALGORITHM (0). |
| algorithm:TranslateAnnotatorOptions_.Algorithm; |
| // NOTE(review): presumably only used when algorithm is BACKOFF - confirm. |
| backoff_options:TranslateAnnotatorOptions_.BackoffOptions; |
| } |
| |
| // A collection together with its priority scores. |
| namespace libtextclassifier3.PodNerModel_; |
| table Collection { |
| // Collection's name (e.g., "location", "person"). |
| name:string (shared); |
| |
| // Priority scores used for conflict resolution with the other annotators |
| // when the annotation is made over a single/multi token text. |
| // Neither score has an explicit default, so absent fields read as 0. |
| single_token_priority_score:float; |
| |
| multi_token_priority_score:float; |
| } |
| |
| // Tag type of a PodNerModel label (see Label.boise_type). |
| // NOTE(review): BEGIN/INTERMEDIATE/END/SINGLE presumably mark a token's |
| // position within an entity span; confirm against the model code. |
| namespace libtextclassifier3.PodNerModel_.Label_; |
| enum BoiseType : int { |
| NONE = 0, |
| BEGIN = 1, |
| // No label. |
| O = 2, |
| |
| INTERMEDIATE = 3, |
| SINGLE = 4, |
| END = 5, |
| } |
| |
| // Mention type of a PodNerModel label (see Label.mention_type). |
| // NOTE(review): NAM/NOM presumably stand for named and nominal mentions; |
| // confirm against the model code. |
| namespace libtextclassifier3.PodNerModel_.Label_; |
| enum MentionType : int { |
| UNDEFINED = 0, |
| NAM = 1, |
| NOM = 2, |
| } |
| |
| // One label the NER model can output; used as the element type of |
| // PodNerModel.labels. |
| namespace libtextclassifier3.PodNerModel_; |
| table Label { |
| // An unset value reads as 0, i.e. NONE. |
| boise_type:Label_.BoiseType; |
| // An unset value reads as 0, i.e. UNDEFINED. |
| mention_type:Label_.MentionType; |
| // Points to the collections array. |
| // NOTE(review): presumably an index into PodNerModel.collections, which is |
| // declared in the PodNerModel table further down this file; confirm. |
| collection_id:int; |
| } |
| |
| // Model data and options for the PodNer (NER) model. |
| namespace libtextclassifier3; |
| table PodNerModel { |
| // Raw bytes of the tflite model. |
| tflite_model:[ubyte]; |
| // Raw bytes of the wordpiece vocabulary. |
| // NOTE(review): the serialization format of the vocabulary is not specified |
| // in this schema; confirm against the loader. |
| word_piece_vocab:[ubyte]; |
| // Whether the input is lowercased. Enabled by default. |
| // NOTE(review): presumably applied before wordpiece tokenization; confirm. |
| lowercase_input:bool = true; |
| |
| // Index of mention_logits tensor in the output of the tflite model. Can |
| // be found in the textproto output after model is converted to tflite. |
| // Defaults to 0. |
| logits_index_in_output_tensor:int = 0; |
| |
| // Whether to append a period at the end of an input that doesn't already |
| // end in punctuation. Disabled by default. |
| append_final_period:bool = false; |
| |
| // Priority score used for conflict resolution with the other models. Used |
| // only if the collections array (presumably the `collections` field of this |
| // table) is empty. Defaults to 0. |
| priority_score:float = 0; |
| |
| // Maximum number of wordpieces supported by the model. Defaults to 128. |
| max_num_wordpieces:int = 128; |
| |
| // In case of long text (number of wordpieces greater than the max) we use |
| // sliding window approach, this determines the number of overlapping |
| // wordpieces between two consecutive windows. This overlap enables context |
| // for each word NER annotates. Defaults to 20. |
| sliding_window_num_wordpieces_overlap:int = 20; |
| // Deprecated field. It stays declared so that the field slots of the |
| // entries after it are unchanged. |
| reserved_9:int16 (deprecated); |
| |
| // The possible labels the ner model can output. If empty the default labels |
| // will be used. |
| labels:[PodNerModel_.Label]; |
| |
| // If the ratio of unknown wordpieces in the input text is greater than this |
| // maximum, the text won't be annotated. Defaults to 0.1. |
| max_ratio_unknown_wordpieces:float = 0.1; |
| |
| // Possible collections for labeled entities. |
| collections:[PodNerModel_.Collection]; |
| |
| // Minimum word-length (number of tokens) required for the text to be |
| // annotated. Defaults to 1. |
| min_number_of_tokens:int = 1; |
| |
| // Minimum wordpieces-length (number of wordpieces) required for the text to |
| // be annotated. Defaults to 1. |
| min_number_of_wordpieces:int = 1; |
| } |
| |
| // Data and options of the vocab model, which triggers "Define" on vocabs |
| // stored in a trie. |
| namespace libtextclassifier3; |
| table VocabModel { |
| // A trie that stores a list of vocabs that triggers "Define". An id is |
| // returned when looking up a vocab from the trie and the id can be used |
| // to access more information about that vocab. The marisa trie library |
| // requires 8-byte alignment because the first thing in a marisa trie is a |
| // 64-bit integer. |
| vocab_trie:[ubyte] (force_align: 8); |
| |
| // A bit vector that tells if the vocab should trigger "Define" for users of |
| // beginner proficiency only. To look up the bit vector, use the id returned |
| // by the trie. |
| beginner_level:BitVectorData; |
| |
| // A sorted list of indices of vocabs that should not trigger "Define" if |
| // its leading character is in upper case. The indices are those returned by |
| // trie. You may perform binary search to look up an index. |
| // NOTE(review): the field type is BitVectorData (same as beginner_level), |
| // not a sorted index list, so the description above may be stale; confirm |
| // whether this is looked up by trie id like beginner_level. |
| do_not_trigger_in_upper_case:BitVectorData; |
| |
| // Comma-separated list of locales (BCP 47 tags) that the model supports, that |
| // are used to prevent triggering on input in unsupported languages. If |
| // empty, the model will trigger on all inputs. |
| triggering_locales:string (shared); |
| |
| // The final score to assign to the results of the vocab model. Defaults |
| // to 1. |
| target_classification_score:float = 1; |
| |
| // Priority score used for conflict resolution with the other models. |
| // Defaults to 0. |
| priority_score:float = 0; |
| } |
| |
| // Declares libtextclassifier3.Model as the root table of buffers serialized |
| // with this schema. |
| root_type libtextclassifier3.Model; |