// model.fbs - platform/external/libtextclassifier - Git at Google

 //
 // Copyright (C) 2017 The Android Open Source Project
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //

 // Identifier for buffers of this schema: "TC2" followed by one trailing
 // space (FlatBuffers file identifiers must be exactly 4 characters long).
 file_identifier "TC2 ";

 // The possible model modes, represents a bit field.
 // The single-mode bits are ANNOTATION (1), CLASSIFICATION (2) and
 // SELECTION (4). Every combination of those bits is spelled out as its own
 // named enumerator, up to ALL (7) which has all three bits set.
 namespace libtextclassifier2;
 enum ModeFlag : int {
   // No mode bit set.
   NONE = 0,
   ANNOTATION = 1,
   CLASSIFICATION = 2,
   // ANNOTATION | CLASSIFICATION.
   ANNOTATION_AND_CLASSIFICATION = 3,
   SELECTION = 4,
   // ANNOTATION | SELECTION.
   ANNOTATION_AND_SELECTION = 5,
   // CLASSIFICATION | SELECTION.
   CLASSIFICATION_AND_SELECTION = 6,
   // ANNOTATION | CLASSIFICATION | SELECTION.
   ALL = 7,
 }

 // Type tag carried by DatetimeModelExtractor.extractor. The values run
 // consecutively from 0 to 72. The grouping comments inside the enum are
 // descriptive only and are inferred from the enumerator names.
 namespace libtextclassifier2;
 enum DatetimeExtractorType : int {
   UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
   // Meridiem markers.
   AM = 1,
   PM = 2,
   // Month names.
   JANUARY = 3,
   FEBRUARY = 4,
   MARCH = 5,
   APRIL = 6,
   MAY = 7,
   JUNE = 8,
   JULY = 9,
   AUGUST = 10,
   SEPTEMBER = 11,
   OCTOBER = 12,
   NOVEMBER = 13,
   DECEMBER = 14,
   // Relative-time words.
   NEXT = 15,
   NEXT_OR_SAME = 16,
   LAST = 17,
   NOW = 18,
   TOMORROW = 19,
   YESTERDAY = 20,
   PAST = 21,
   FUTURE = 22,
   // Singular calendar units.
   DAY = 23,
   WEEK = 24,
   MONTH = 25,
   YEAR = 26,
   // Weekday names.
   MONDAY = 27,
   TUESDAY = 28,
   WEDNESDAY = 29,
   THURSDAY = 30,
   FRIDAY = 31,
   SATURDAY = 32,
   SUNDAY = 33,
   // Plural time units.
   DAYS = 34,
   WEEKS = 35,
   MONTHS = 36,
   HOURS = 37,
   MINUTES = 38,
   SECONDS = 39,
   YEARS = 40,
   // Numbers written with digits.
   DIGITS = 41,
   SIGNEDDIGITS = 42,
   // Numbers written as words.
   ZERO = 43,
   ONE = 44,
   TWO = 45,
   THREE = 46,
   FOUR = 47,
   FIVE = 48,
   SIX = 49,
   SEVEN = 50,
   EIGHT = 51,
   NINE = 52,
   TEN = 53,
   ELEVEN = 54,
   TWELVE = 55,
   THIRTEEN = 56,
   FOURTEEN = 57,
   FIFTEEN = 58,
   SIXTEEN = 59,
   SEVENTEEN = 60,
   EIGHTEEN = 61,
   NINETEEN = 62,
   TWENTY = 63,
   THIRTY = 64,
   FORTY = 65,
   FIFTY = 66,
   SIXTY = 67,
   SEVENTY = 68,
   EIGHTY = 69,
   NINETY = 70,
   HUNDRED = 71,
   THOUSAND = 72,
 }

 // Type of a regex capturing group in a datetime pattern. The groups field of
 // DatetimeModelPattern_.Regex holds one of these per capturing group and it
 // decides how the matched content is parsed.
 namespace libtextclassifier2;
 enum DatetimeGroupType : int {
   GROUP_UNKNOWN = 0,
   GROUP_UNUSED = 1,
   GROUP_YEAR = 2,
   GROUP_MONTH = 3,
   GROUP_DAY = 4,
   GROUP_HOUR = 5,
   GROUP_MINUTE = 6,
   GROUP_SECOND = 7,
   GROUP_AMPM = 8,
   GROUP_RELATIONDISTANCE = 9,
   GROUP_RELATION = 10,
   GROUP_RELATIONTYPE = 11,

   // Dummy groups serve just as an inflator of the selection. E.g. we might want
   // to select more text than was contained in an envelope of all extractor
   // spans.
   GROUP_DUMMY1 = 12,

   // Second dummy group; see the comment on GROUP_DUMMY1.
   GROUP_DUMMY2 = 13,
 }

 // A compressed payload. Used as the type of the compressed_pattern fields in
 // this schema. The compression scheme is not specified here.
 namespace libtextclassifier2;
 table CompressedBuffer {
   // The compressed bytes.
   buffer:[ubyte];
   // Size of the data once uncompressed.
   // NOTE(review): presumably measured in bytes - confirm against the reader.
   uncompressed_size:int;
 }

 // Options for the model that predicts text selection.
 // Referenced from Model.selection_options.
 namespace libtextclassifier2;
 table SelectionModelOptions {
   // If true, unpaired brackets contained in the predicted selection are
   // stripped from both ends of the selection before it is returned.
   // The bracket codepoints are defined in the Unicode standard:
   // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
   strip_unpaired_brackets:bool = 1;

   // Number of hypothetical click positions on either side of the actual click
   // to consider in order to enforce symmetry.
   // No explicit default is given, so the FlatBuffers scalar default (0) applies.
   symmetry_context_size:int;

   // Number of examples to bundle in one batch for inference.
   batch_size:int = 1024;

   // Whether to always classify a suggested selection or only on demand.
   always_classify_suggested_selection:bool = 0;
 }

 // Options for the model that classifies a text selection.
 // Referenced from Model.classification_options.
 namespace libtextclassifier2;
 table ClassificationModelOptions {
   // Limits for phone numbers: lower bound on the number of digits.
   phone_min_num_digits:int = 7;

   // Limits for phone numbers: upper bound on the number of digits.
   phone_max_num_digits:int = 15;

   // Limits for addresses: lower bound on the number of tokens.
   // No explicit default is given, so the FlatBuffers scalar default (0) applies.
   address_min_num_tokens:int;

   // Maximum number of tokens to attempt a classification (-1 is unlimited).
   max_num_tokens:int = -1;
 }

 // A single regular expression matcher to check. RegexModel.patterns holds the
 // list of these.
 namespace libtextclassifier2.RegexModel_;
 table Pattern {
   // The name of the collection of a match.
   collection_name:string;

   // The pattern to check.
   // Can specify a single capturing group used as match boundaries.
   pattern:string;

   // The modes for which to apply the patterns.
   enabled_modes:libtextclassifier2.ModeFlag = ALL;

   // The final score to assign to the results of this pattern.
   target_classification_score:float = 1;

   // Priority score used for conflict resolution with the other models.
   priority_score:float = 0;

   // If true, will use an approximate matching implementation implemented
   // using Find() instead of the true Match(). This approximate matching will
   // use the first Find() result and then check that it spans the whole input.
   use_approximate_matching:bool = 0;

   // Compressed form of the pattern.
   // NOTE(review): presumably an alternative to the plain `pattern` string -
   // confirm which of the two the reader prefers when both are set.
   compressed_pattern:libtextclassifier2.CompressedBuffer;
 }

 // The regex-based model: a list of regular expression matchers to check.
 // Referenced from Model.regex_model.
 namespace libtextclassifier2;
 table RegexModel {
   // The matchers, each with its own collection name, scores and modes.
   patterns:[libtextclassifier2.RegexModel_.Pattern];
 }

 // A single regex of a datetime pattern. DatetimeModelPattern.regexes holds the
 // list of these.
 namespace libtextclassifier2.DatetimeModelPattern_;
 table Regex {
   // The regular expression.
   pattern:string;

   // The ith entry specifies the type of the ith capturing group.
   // This is used to decide how the matched content has to be parsed.
   groups:[libtextclassifier2.DatetimeGroupType];

   // Compressed form of the pattern.
   // NOTE(review): presumably an alternative to the plain `pattern` string -
   // confirm which of the two the reader prefers when both are set.
   compressed_pattern:libtextclassifier2.CompressedBuffer;
 }

 // A group of datetime regexes that share locales, scores and enabled modes.
 // DatetimeModel.patterns holds the list of these.
 namespace libtextclassifier2;
 table DatetimeModelPattern {
   // List of regex patterns.
   regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];

   // List of locale indices in DatetimeModel that represent the locales that
   // these patterns should be used for. If empty, can be used for all locales.
   locales:[int];

   // The final score to assign to the results of this pattern.
   target_classification_score:float = 1;

   // Priority score used for conflict resolution with the other models.
   priority_score:float = 0;

   // The modes for which to apply the patterns.
   enabled_modes:libtextclassifier2.ModeFlag = ALL;
 }

 // Associates a DatetimeExtractorType with a pattern.
 // DatetimeModel.extractors holds the list of these.
 namespace libtextclassifier2;
 table DatetimeModelExtractor {
   // The type of this extractor.
   extractor:libtextclassifier2.DatetimeExtractorType;
   // The pattern belonging to this extractor.
   pattern:string;
   // NOTE(review): presumably indices into DatetimeModel.locales, like
   // DatetimeModelPattern.locales - confirm.
   locales:[int];
   // Compressed form of the pattern.
   // NOTE(review): presumably an alternative to the plain `pattern` string -
   // confirm which of the two the reader prefers when both are set.
   compressed_pattern:libtextclassifier2.CompressedBuffer;
 }

 // The datetime model: supported locales plus the patterns and extractors that
 // refer to those locales by index. Referenced from Model.datetime_model and
 // from DatetimeModelLibrary_.Item.value.
 namespace libtextclassifier2;
 table DatetimeModel {
   // List of BCP 47 locale strings representing all locales supported by the
   // model. The individual patterns refer back to them using an index.
   locales:[string];

   // The datetime patterns of the model.
   patterns:[libtextclassifier2.DatetimeModelPattern];
   // The extractors of the model.
   extractors:[libtextclassifier2.DatetimeModelExtractor];

   // If true, will use the extractors for determining the match location as
   // opposed to using the location where the global pattern matched.
   use_extractors_for_locating:bool = 1;

   // List of locale ids whose rules are always run, after the requested
   // ones.
   default_locales:[int];
 }

 // One named entry of a DatetimeModelLibrary (see DatetimeModelLibrary.models).
 namespace libtextclassifier2.DatetimeModelLibrary_;
 table Item {
   // Name of the model.
   key:string;
   // The datetime model stored under that name.
   value:libtextclassifier2.DatetimeModel;
 }

 // A set of named DateTime models.
 namespace libtextclassifier2;
 table DatetimeModelLibrary {
   // The named models, stored as key/value items.
   models:[libtextclassifier2.DatetimeModelLibrary_.Item];
 }

 // Options controlling the output of the Tensorflow Lite models.
 // Referenced from Model.triggering_options.
 namespace libtextclassifier2;
 table ModelTriggeringOptions {
   // Lower bound threshold for filtering annotation model outputs.
   min_annotate_confidence:float = 0;

   // The modes for which to enable the models. Defaults to all modes.
   enabled_modes:libtextclassifier2.ModeFlag = ALL;
 }

 // Options controlling the output of the classifier.
 // Referenced from Model.output_options.
 namespace libtextclassifier2;
 table OutputOptions {
   // Lists of collection names that will be filtered out at the output:
   // - For annotation, the spans of given collection are simply dropped.
   // - For classification, the result is mapped to the class "other".
   // - For selection, the spans of given class are returned as
   // single-selection.
   // Collections filtered out for annotation.
   filtered_collections_annotation:[string];

   // Collections filtered out for classification.
   filtered_collections_classification:[string];
   // Collections filtered out for selection.
   filtered_collections_selection:[string];
 }

 // The top-level table of this schema (declared as root_type at the end of the
 // schema). Bundles the Tensorflow Lite models with their feature options,
 // the regex and datetime models, and the global output configuration.
 namespace libtextclassifier2;
 table Model {
   // Comma-separated list of locales supported by the model as BCP 47 tags.
   locales:string;

   // Version number of the model.
   version:int;

   // A name for the model that can be used for e.g. logging.
   name:string;

   // Feature processor configuration for the selection model.
   selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
   // Feature processor configuration for the classification model.
   classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

   // Tensorflow Lite models.
   // Each is stored as a raw byte vector; force_align: 16 requests 16-byte
   // alignment of the vector data inside the buffer.
   selection_model:[ubyte] (force_align: 16);

   classification_model:[ubyte] (force_align: 16);
   embedding_model:[ubyte] (force_align: 16);

   // Options for the different models.
   selection_options:libtextclassifier2.SelectionModelOptions;

   classification_options:libtextclassifier2.ClassificationModelOptions;
   regex_model:libtextclassifier2.RegexModel;
   datetime_model:libtextclassifier2.DatetimeModel;

   // Options controlling the output of the models.
   triggering_options:libtextclassifier2.ModelTriggeringOptions;

   // Global switch that controls if SuggestSelection(), ClassifyText() and
   // Annotate() will run. If a mode is disabled it returns empty/no-op results.
   enabled_modes:libtextclassifier2.ModeFlag = ALL;

   // If true, will snap the selections that consist only of whitespaces to the
   // containing suggested span. Otherwise, no suggestion is proposed, since the
   // selections are not part of any token.
   snap_whitespace_selections:bool = 1;

   // Global configuration for the output of SuggestSelection(), ClassifyText()
   // and Annotate().
   output_options:libtextclassifier2.OutputOptions;
 }

 // Role of the codepoints in the range.
 // Used as the type of TokenizationCodepointRange.role. Numerically,
 // TOKEN_SEPARATOR (3) equals SPLIT_BEFORE | SPLIT_AFTER and
 // WHITESPACE_SEPARATOR (7) equals
 // SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT.
 namespace libtextclassifier2.TokenizationCodepointRange_;
 enum Role : int {
   // Concatenates the codepoint to the current run of codepoints.
   DEFAULT_ROLE = 0,

   // Splits a run of codepoints before the current codepoint.
   SPLIT_BEFORE = 1,

   // Splits a run of codepoints after the current codepoint.
   SPLIT_AFTER = 2,

   // Each codepoint will be a separate token. Good e.g. for Chinese
   // characters.
   TOKEN_SEPARATOR = 3,

   // Discards the codepoint.
   DISCARD_CODEPOINT = 4,

   // Common values:
   // Splits on the characters and discards them. Good e.g. for the space
   // character.
   WHITESPACE_SEPARATOR = 7,
 }

 // Represents a codepoint range [start, end) with its role for tokenization.
 // FeatureProcessorOptions.tokenization_codepoint_config holds the list of
 // these.
 namespace libtextclassifier2;
 table TokenizationCodepointRange {
   // First codepoint of the range (inclusive).
   start:int;
   // End of the range (exclusive).
   end:int;
   // How codepoints in this range take part in tokenization.
   role:libtextclassifier2.TokenizationCodepointRange_.Role;

   // Integer identifier of the script this range denotes. Negative values are
   // reserved for Tokenizer's internal use.
   script_id:int;
 }

 // Method for selecting the center token.
 // Used as the type of FeatureProcessorOptions.center_token_selection_method.
 namespace libtextclassifier2.FeatureProcessorOptions_;
 enum CenterTokenSelectionMethod : int {
   // Value 0; also the value a field of this type takes when it is not set.
   DEFAULT_CENTER_TOKEN_METHOD = 0,

   // Use click indices to determine the center token.
   CENTER_TOKEN_FROM_CLICK = 1,

   // Use selection indices to get a token range, and select the middle of it
   // as the center token.
   CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
 }

 // Controls the type of tokenization the model will use for the input text.
 // Used as the type of FeatureProcessorOptions.tokenization_type, which
 // defaults to INTERNAL_TOKENIZER.
 namespace libtextclassifier2.FeatureProcessorOptions_;
 enum TokenizationType : int {
   INVALID_TOKENIZATION_TYPE = 0,

   // Use the internal tokenizer for tokenization.
   INTERNAL_TOKENIZER = 1,

   // Use ICU for tokenization.
   ICU = 2,

   // First apply ICU tokenization. Then identify stretches of tokens
   // consisting only of codepoints in internal_tokenizer_codepoint_ranges
   // and re-tokenize them using the internal tokenizer.
   MIXED = 3,
 }

 // Range of codepoints start - end, where end is exclusive.
 // Used by FeatureProcessorOptions.supported_codepoint_ranges and
 // FeatureProcessorOptions.internal_tokenizer_codepoint_ranges.
 namespace libtextclassifier2.FeatureProcessorOptions_;
 table CodepointRange {
   // First codepoint of the range (inclusive).
   start:int;
   // End of the range (exclusive).
   end:int;
 }

 // Bounds-sensitive feature extraction configuration.
 // Referenced from FeatureProcessorOptions.bounds_sensitive_features. None of
 // the fields declares an explicit default, so the FlatBuffers scalar defaults
 // (false / 0) apply.
 namespace libtextclassifier2.FeatureProcessorOptions_;
 table BoundsSensitiveFeatures {
   // Enables the extraction of bounds-sensitive features, instead of the click
   // context features.
   enabled:bool;

   // The numbers of tokens to extract in specific locations relative to the
   // bounds.
   // Immediately before the span.
   num_tokens_before:int;

   // Inside the span, aligned with the beginning.
   num_tokens_inside_left:int;

   // Inside the span, aligned with the end.
   num_tokens_inside_right:int;

   // Immediately after the span.
   num_tokens_after:int;

   // If true, also extracts the tokens of the entire span and adds up their
   // features forming one "token" to include in the extracted features.
   include_inside_bag:bool;

   // If true, includes the selection length (in the number of tokens) as a
   // feature.
   include_inside_length:bool;

   // If true, for selection, single token spans are not run through the model
   // and their score is assumed to be zero.
   score_single_token_spans_as_zero:bool;
 }

 // A string key/value pair.
 // NOTE(review): no table in the visible part of this schema references this
 // type - confirm whether it is still needed.
 namespace libtextclassifier2.FeatureProcessorOptions_;
 table AlternativeCollectionMapEntry {
   key:string;
   value:string;
 }

 // Configuration of tokenization and feature extraction. Referenced from Model
 // as selection_feature_options and classification_feature_options.
 namespace libtextclassifier2;
 table FeatureProcessorOptions {
   // Number of buckets used for hashing charactergrams.
   num_buckets:int = -1;

   // Size of the embedding.
   embedding_size:int = -1;

   // Number of bits for quantization for embeddings.
   embedding_quantization_bits:int = 8;

   // Context size defines the number of words to the left and to the right of
   // the selected word to be used as context. For example, if context size is
   // N, then we take N words to the left and N words to the right of the
   // selected word as its context.
   context_size:int = -1;

   // Maximum number of words of the context to select in total.
   max_selection_span:int = -1;

   // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
   // character trigrams etc.
   chargram_orders:[int];

   // Maximum length of a word, in codepoints.
   max_word_length:int = 20;

   // If true, will use the unicode-aware functionality for extracting features.
   unicode_aware_features:bool = 0;

   // Whether to extract the token case feature.
   extract_case_feature:bool = 0;

   // Whether to extract the selection mask feature.
   extract_selection_mask_feature:bool = 0;

   // List of regexps to run over each token. For each regexp, if there is a
   // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
   regexp_feature:[string];

   // Whether to remap all digits to a single number.
   remap_digits:bool = 0;

   // Whether to lower-case each token before generating hashgrams.
   // No explicit default is given, so the FlatBuffers default (false) applies.
   lowercase_tokens:bool;

   // If true, the selection classifier output will contain only the selections
   // that are feasible (e.g., those that are shorter than max_selection_span),
   // if false, the output will be a complete cross-product of possible
   // selections to the left and possible selections to the right, including the
   // infeasible ones.
   // NOTE: Exists mainly for compatibility with older models that were trained
   // with the non-reduced output space.
   selection_reduced_output_space:bool = 1;

   // Collection names.
   collections:[string];

   // An index of collection in collections to be used if a collection name can't
   // be mapped to an id.
   default_collection:int = -1;

   // If true, will split the input by lines, and only use the line that contains
   // the clicked token.
   only_use_line_with_click:bool = 0;

   // If true, will split tokens that contain the selection boundary, at the
   // position of the boundary.
   // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
   split_tokens_on_selection_boundaries:bool = 0;

   // Codepoint ranges that determine how different codepoints are tokenized.
   // The ranges must not overlap.
   tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

   // Method for selecting the center token. No explicit default is given, so
   // the value 0 (DEFAULT_CENTER_TOKEN_METHOD) applies when unset.
   center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

   // If true, span boundaries will be snapped to containing tokens and not
   // required to exactly match token boundaries.
   snap_label_span_boundaries_to_containing_tokens:bool;

   // A set of codepoint ranges supported by the model.
   supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

   // A set of codepoint ranges to use in the mixed tokenization mode to identify
   // stretches of tokens to re-tokenize using the internal tokenizer.
   internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

   // Minimum ratio of supported codepoints in the input context. If the ratio
   // is lower than this, the feature computation will fail.
   min_supported_codepoint_ratio:float = 0;

   // Used for versioning the format of features the model expects.
   // - feature_version == 0:
   // For each token the features consist of:
   // - chargram embeddings
   // - dense features
   // Chargram embeddings for tokens are concatenated first together,
   // and at the end, the dense features for the tokens are concatenated
   // to it. So the resulting feature vector has two regions.
   feature_version:int = 0;

   // Type of tokenization to use for the input text. Defaults to the internal
   // tokenizer.
   tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
   // NOTE(review): presumably controls whether ICU tokenization keeps
   // whitespace tokens - confirm against the tokenizer implementation.
   icu_preserve_whitespace_tokens:bool = 0;

   // List of codepoints that will be stripped from beginning and end of
   // predicted spans.
   ignored_span_boundary_codepoints:[int];

   // Bounds-sensitive feature extraction configuration.
   bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

   // List of allowed charactergrams. The extracted charactergrams are filtered
   // using this list, and charactergrams that are not present are interpreted as
   // out-of-vocabulary.
   // If no allowed_chargrams are specified, all charactergrams are allowed.
   // NOTE(review): this comment used to say the field is typed as bytes to
   // allow non-UTF8 chargrams, but the schema declares it as [string] - confirm
   // whether non-UTF8 chargrams are really expected here.
   allowed_chargrams:[string];

   // If true, tokens will be also split when the codepoint's script_id changes
   // as defined in TokenizationCodepointRange.
   tokenize_on_script_change:bool = 0;
 }

 // Declares libtextclassifier2.Model as the root table of serialized buffers.
 root_type libtextclassifier2.Model;
	//
	// Copyright (C) 2017 The Android Open Source Project
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//

	// Identifier for buffers of this schema: "TC2" followed by one trailing
	// space (FlatBuffers file identifiers must be exactly 4 characters long).
	file_identifier "TC2 ";

	// The possible model modes, represents a bit field.
	// The single-mode bits are ANNOTATION (1), CLASSIFICATION (2) and
	// SELECTION (4). Every combination of those bits is spelled out as its own
	// named enumerator, up to ALL (7) which has all three bits set.
	namespace libtextclassifier2;
	enum ModeFlag : int {
	// No mode bit set.
	NONE = 0,
	ANNOTATION = 1,
	CLASSIFICATION = 2,
	// ANNOTATION | CLASSIFICATION.
	ANNOTATION_AND_CLASSIFICATION = 3,
	SELECTION = 4,
	// ANNOTATION | SELECTION.
	ANNOTATION_AND_SELECTION = 5,
	// CLASSIFICATION | SELECTION.
	CLASSIFICATION_AND_SELECTION = 6,
	// ANNOTATION | CLASSIFICATION | SELECTION.
	ALL = 7,
	}

	// Type tag carried by DatetimeModelExtractor.extractor. The values run
	// consecutively from 0 to 72. The grouping comments inside the enum are
	// descriptive only and are inferred from the enumerator names.
	namespace libtextclassifier2;
	enum DatetimeExtractorType : int {
	UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
	// Meridiem markers.
	AM = 1,
	PM = 2,
	// Month names.
	JANUARY = 3,
	FEBRUARY = 4,
	MARCH = 5,
	APRIL = 6,
	MAY = 7,
	JUNE = 8,
	JULY = 9,
	AUGUST = 10,
	SEPTEMBER = 11,
	OCTOBER = 12,
	NOVEMBER = 13,
	DECEMBER = 14,
	// Relative-time words.
	NEXT = 15,
	NEXT_OR_SAME = 16,
	LAST = 17,
	NOW = 18,
	TOMORROW = 19,
	YESTERDAY = 20,
	PAST = 21,
	FUTURE = 22,
	// Singular calendar units.
	DAY = 23,
	WEEK = 24,
	MONTH = 25,
	YEAR = 26,
	// Weekday names.
	MONDAY = 27,
	TUESDAY = 28,
	WEDNESDAY = 29,
	THURSDAY = 30,
	FRIDAY = 31,
	SATURDAY = 32,
	SUNDAY = 33,
	// Plural time units.
	DAYS = 34,
	WEEKS = 35,
	MONTHS = 36,
	HOURS = 37,
	MINUTES = 38,
	SECONDS = 39,
	YEARS = 40,
	// Numbers written with digits.
	DIGITS = 41,
	SIGNEDDIGITS = 42,
	// Numbers written as words.
	ZERO = 43,
	ONE = 44,
	TWO = 45,
	THREE = 46,
	FOUR = 47,
	FIVE = 48,
	SIX = 49,
	SEVEN = 50,
	EIGHT = 51,
	NINE = 52,
	TEN = 53,
	ELEVEN = 54,
	TWELVE = 55,
	THIRTEEN = 56,
	FOURTEEN = 57,
	FIFTEEN = 58,
	SIXTEEN = 59,
	SEVENTEEN = 60,
	EIGHTEEN = 61,
	NINETEEN = 62,
	TWENTY = 63,
	THIRTY = 64,
	FORTY = 65,
	FIFTY = 66,
	SIXTY = 67,
	SEVENTY = 68,
	EIGHTY = 69,
	NINETY = 70,
	HUNDRED = 71,
	THOUSAND = 72,
	}

	// Type of a regex capturing group in a datetime pattern. The groups field of
	// DatetimeModelPattern_.Regex holds one of these per capturing group and it
	// decides how the matched content is parsed.
	namespace libtextclassifier2;
	enum DatetimeGroupType : int {
	GROUP_UNKNOWN = 0,
	GROUP_UNUSED = 1,
	GROUP_YEAR = 2,
	GROUP_MONTH = 3,
	GROUP_DAY = 4,
	GROUP_HOUR = 5,
	GROUP_MINUTE = 6,
	GROUP_SECOND = 7,
	GROUP_AMPM = 8,
	GROUP_RELATIONDISTANCE = 9,
	GROUP_RELATION = 10,
	GROUP_RELATIONTYPE = 11,

	// Dummy groups serve just as an inflator of the selection. E.g. we might want
	// to select more text than was contained in an envelope of all extractor
	// spans.
	GROUP_DUMMY1 = 12,

	// Second dummy group; see the comment on GROUP_DUMMY1.
	GROUP_DUMMY2 = 13,
	}

	// A compressed payload. Used as the type of the compressed_pattern fields in
	// this schema. The compression scheme is not specified here.
	namespace libtextclassifier2;
	table CompressedBuffer {
	// The compressed bytes.
	buffer:[ubyte];
	// Size of the data once uncompressed.
	// NOTE(review): presumably measured in bytes - confirm against the reader.
	uncompressed_size:int;
	}

	// Options for the model that predicts text selection.
	// Referenced from Model.selection_options.
	namespace libtextclassifier2;
	table SelectionModelOptions {
	// If true, unpaired brackets contained in the predicted selection are
	// stripped from both ends of the selection before it is returned.
	// The bracket codepoints are defined in the Unicode standard:
	// http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
	strip_unpaired_brackets:bool = 1;

	// Number of hypothetical click positions on either side of the actual click
	// to consider in order to enforce symmetry.
	// No explicit default is given, so the FlatBuffers scalar default (0) applies.
	symmetry_context_size:int;

	// Number of examples to bundle in one batch for inference.
	batch_size:int = 1024;

	// Whether to always classify a suggested selection or only on demand.
	always_classify_suggested_selection:bool = 0;
	}

	// Options for the model that classifies a text selection.
	// Referenced from Model.classification_options.
	namespace libtextclassifier2;
	table ClassificationModelOptions {
	// Limits for phone numbers: lower bound on the number of digits.
	phone_min_num_digits:int = 7;

	// Limits for phone numbers: upper bound on the number of digits.
	phone_max_num_digits:int = 15;

	// Limits for addresses: lower bound on the number of tokens.
	// No explicit default is given, so the FlatBuffers scalar default (0) applies.
	address_min_num_tokens:int;

	// Maximum number of tokens to attempt a classification (-1 is unlimited).
	max_num_tokens:int = -1;
	}

	// A single regular expression matcher to check. RegexModel.patterns holds the
	// list of these.
	namespace libtextclassifier2.RegexModel_;
	table Pattern {
	// The name of the collection of a match.
	collection_name:string;

	// The pattern to check.
	// Can specify a single capturing group used as match boundaries.
	pattern:string;

	// The modes for which to apply the patterns.
	enabled_modes:libtextclassifier2.ModeFlag = ALL;

	// The final score to assign to the results of this pattern.
	target_classification_score:float = 1;

	// Priority score used for conflict resolution with the other models.
	priority_score:float = 0;

	// If true, will use an approximate matching implementation implemented
	// using Find() instead of the true Match(). This approximate matching will
	// use the first Find() result and then check that it spans the whole input.
	use_approximate_matching:bool = 0;

	// Compressed form of the pattern.
	// NOTE(review): presumably an alternative to the plain `pattern` string -
	// confirm which of the two the reader prefers when both are set.
	compressed_pattern:libtextclassifier2.CompressedBuffer;
	}

	// The regex-based model: a list of regular expression matchers to check.
	// Referenced from Model.regex_model.
	namespace libtextclassifier2;
	table RegexModel {
	// The matchers, each with its own collection name, scores and modes.
	patterns:[libtextclassifier2.RegexModel_.Pattern];
	}

	// A single regex of a datetime pattern. DatetimeModelPattern.regexes holds the
	// list of these.
	namespace libtextclassifier2.DatetimeModelPattern_;
	table Regex {
	// The regular expression.
	pattern:string;

	// The ith entry specifies the type of the ith capturing group.
	// This is used to decide how the matched content has to be parsed.
	groups:[libtextclassifier2.DatetimeGroupType];

	// Compressed form of the pattern.
	// NOTE(review): presumably an alternative to the plain `pattern` string -
	// confirm which of the two the reader prefers when both are set.
	compressed_pattern:libtextclassifier2.CompressedBuffer;
	}

	// A group of datetime regexes that share locales, scores and enabled modes.
	// DatetimeModel.patterns holds the list of these.
	namespace libtextclassifier2;
	table DatetimeModelPattern {
	// List of regex patterns.
	regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];

	// List of locale indices in DatetimeModel that represent the locales that
	// these patterns should be used for. If empty, can be used for all locales.
	locales:[int];

	// The final score to assign to the results of this pattern.
	target_classification_score:float = 1;

	// Priority score used for conflict resolution with the other models.
	priority_score:float = 0;

	// The modes for which to apply the patterns.
	enabled_modes:libtextclassifier2.ModeFlag = ALL;
	}

	// Associates a DatetimeExtractorType with a pattern.
	// DatetimeModel.extractors holds the list of these.
	namespace libtextclassifier2;
	table DatetimeModelExtractor {
	// The type of this extractor.
	extractor:libtextclassifier2.DatetimeExtractorType;
	// The pattern belonging to this extractor.
	pattern:string;
	// NOTE(review): presumably indices into DatetimeModel.locales, like
	// DatetimeModelPattern.locales - confirm.
	locales:[int];
	// Compressed form of the pattern.
	// NOTE(review): presumably an alternative to the plain `pattern` string -
	// confirm which of the two the reader prefers when both are set.
	compressed_pattern:libtextclassifier2.CompressedBuffer;
	}

	// The datetime model: supported locales plus the patterns and extractors that
	// refer to those locales by index. Referenced from Model.datetime_model and
	// from DatetimeModelLibrary_.Item.value.
	namespace libtextclassifier2;
	table DatetimeModel {
	// List of BCP 47 locale strings representing all locales supported by the
	// model. The individual patterns refer back to them using an index.
	locales:[string];

	// The datetime patterns of the model.
	patterns:[libtextclassifier2.DatetimeModelPattern];
	// The extractors of the model.
	extractors:[libtextclassifier2.DatetimeModelExtractor];

	// If true, will use the extractors for determining the match location as
	// opposed to using the location where the global pattern matched.
	use_extractors_for_locating:bool = 1;

	// List of locale ids whose rules are always run, after the requested
	// ones.
	default_locales:[int];
	}

	// One named entry of a DatetimeModelLibrary (see DatetimeModelLibrary.models).
	namespace libtextclassifier2.DatetimeModelLibrary_;
	table Item {
	// Name of the model.
	key:string;
	// The datetime model stored under that name.
	value:libtextclassifier2.DatetimeModel;
	}

	// A set of named DateTime models.
	namespace libtextclassifier2;
	table DatetimeModelLibrary {
	// The named models, stored as key/value items.
	models:[libtextclassifier2.DatetimeModelLibrary_.Item];
	}

	// Options controlling the output of the Tensorflow Lite models.
	// Referenced from Model.triggering_options.
	namespace libtextclassifier2;
	table ModelTriggeringOptions {
	// Lower bound threshold for filtering annotation model outputs.
	min_annotate_confidence:float = 0;

	// The modes for which to enable the models. Defaults to all modes.
	enabled_modes:libtextclassifier2.ModeFlag = ALL;
	}

	// Options controlling the output of the classifier.
	// Referenced from Model.output_options.
	namespace libtextclassifier2;
	table OutputOptions {
	// Lists of collection names that will be filtered out at the output:
	// - For annotation, the spans of given collection are simply dropped.
	// - For classification, the result is mapped to the class "other".
	// - For selection, the spans of given class are returned as
	// single-selection.
	// Collections filtered out for annotation.
	filtered_collections_annotation:[string];

	// Collections filtered out for classification.
	filtered_collections_classification:[string];
	// Collections filtered out for selection.
	filtered_collections_selection:[string];
	}

	// The top-level table of this schema (declared as root_type at the end of the
	// schema). Bundles the Tensorflow Lite models with their feature options,
	// the regex and datetime models, and the global output configuration.
	namespace libtextclassifier2;
	table Model {
	// Comma-separated list of locales supported by the model as BCP 47 tags.
	locales:string;

	// Version number of the model.
	version:int;

	// A name for the model that can be used for e.g. logging.
	name:string;

	// Feature processor configuration for the selection model.
	selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
	// Feature processor configuration for the classification model.
	classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

	// Tensorflow Lite models.
	// Each is stored as a raw byte vector; force_align: 16 requests 16-byte
	// alignment of the vector data inside the buffer.
	selection_model:[ubyte] (force_align: 16);

	classification_model:[ubyte] (force_align: 16);
	embedding_model:[ubyte] (force_align: 16);

	// Options for the different models.
	selection_options:libtextclassifier2.SelectionModelOptions;

	classification_options:libtextclassifier2.ClassificationModelOptions;
	regex_model:libtextclassifier2.RegexModel;
	datetime_model:libtextclassifier2.DatetimeModel;

	// Options controlling the output of the models.
	triggering_options:libtextclassifier2.ModelTriggeringOptions;

	// Global switch that controls if SuggestSelection(), ClassifyText() and
	// Annotate() will run. If a mode is disabled it returns empty/no-op results.
	enabled_modes:libtextclassifier2.ModeFlag = ALL;

	// If true, will snap the selections that consist only of whitespaces to the
	// containing suggested span. Otherwise, no suggestion is proposed, since the
	// selections are not part of any token.
	snap_whitespace_selections:bool = 1;

	// Global configuration for the output of SuggestSelection(), ClassifyText()
	// and Annotate().
	output_options:libtextclassifier2.OutputOptions;
	}

	// Role of the codepoints in the range.
	// Used as the type of TokenizationCodepointRange.role. Numerically,
	// TOKEN_SEPARATOR (3) equals SPLIT_BEFORE | SPLIT_AFTER and
	// WHITESPACE_SEPARATOR (7) equals
	// SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT.
	namespace libtextclassifier2.TokenizationCodepointRange_;
	enum Role : int {
	// Concatenates the codepoint to the current run of codepoints.
	DEFAULT_ROLE = 0,

	// Splits a run of codepoints before the current codepoint.
	SPLIT_BEFORE = 1,

	// Splits a run of codepoints after the current codepoint.
	SPLIT_AFTER = 2,

	// Each codepoint will be a separate token. Good e.g. for Chinese
	// characters.
	TOKEN_SEPARATOR = 3,

	// Discards the codepoint.
	DISCARD_CODEPOINT = 4,

	// Common values:
	// Splits on the characters and discards them. Good e.g. for the space
	// character.
	WHITESPACE_SEPARATOR = 7,
	}

	// Represents a codepoint range [start, end) with its role for tokenization.
	// FeatureProcessorOptions.tokenization_codepoint_config holds the list of
	// these.
	namespace libtextclassifier2;
	table TokenizationCodepointRange {
	// First codepoint of the range (inclusive).
	start:int;
	// End of the range (exclusive).
	end:int;
	// How codepoints in this range take part in tokenization.
	role:libtextclassifier2.TokenizationCodepointRange_.Role;

	// Integer identifier of the script this range denotes. Negative values are
	// reserved for Tokenizer's internal use.
	script_id:int;
	}

	// Method for selecting the center token.
	// Used as the type of FeatureProcessorOptions.center_token_selection_method.
	namespace libtextclassifier2.FeatureProcessorOptions_;
	enum CenterTokenSelectionMethod : int {
	// Value 0; also the value a field of this type takes when it is not set.
	DEFAULT_CENTER_TOKEN_METHOD = 0,

	// Use click indices to determine the center token.
	CENTER_TOKEN_FROM_CLICK = 1,

	// Use selection indices to get a token range, and select the middle of it
	// as the center token.
	CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
	}

	// Controls the type of tokenization the model will use for the input text.
	// Used as the type of FeatureProcessorOptions.tokenization_type, which
	// defaults to INTERNAL_TOKENIZER.
	namespace libtextclassifier2.FeatureProcessorOptions_;
	enum TokenizationType : int {
	INVALID_TOKENIZATION_TYPE = 0,

	// Use the internal tokenizer for tokenization.
	INTERNAL_TOKENIZER = 1,

	// Use ICU for tokenization.
	ICU = 2,

	// First apply ICU tokenization. Then identify stretches of tokens
	// consisting only of codepoints in internal_tokenizer_codepoint_ranges
	// and re-tokenize them using the internal tokenizer.
	MIXED = 3,
	}

	// Range of codepoints start - end, where end is exclusive.
	// Used by FeatureProcessorOptions.supported_codepoint_ranges and
	// FeatureProcessorOptions.internal_tokenizer_codepoint_ranges.
	namespace libtextclassifier2.FeatureProcessorOptions_;
	table CodepointRange {
	// First codepoint of the range (inclusive).
	start:int;
	// End of the range (exclusive).
	end:int;
	}

	// Bounds-sensitive feature extraction configuration.
	// Referenced from FeatureProcessorOptions.bounds_sensitive_features. None of
	// the fields declares an explicit default, so the FlatBuffers scalar defaults
	// (false / 0) apply.
	namespace libtextclassifier2.FeatureProcessorOptions_;
	table BoundsSensitiveFeatures {
	// Enables the extraction of bounds-sensitive features, instead of the click
	// context features.
	enabled:bool;

	// The numbers of tokens to extract in specific locations relative to the
	// bounds.
	// Immediately before the span.
	num_tokens_before:int;

	// Inside the span, aligned with the beginning.
	num_tokens_inside_left:int;

	// Inside the span, aligned with the end.
	num_tokens_inside_right:int;

	// Immediately after the span.
	num_tokens_after:int;

	// If true, also extracts the tokens of the entire span and adds up their
	// features forming one "token" to include in the extracted features.
	include_inside_bag:bool;

	// If true, includes the selection length (in the number of tokens) as a
	// feature.
	include_inside_length:bool;

	// If true, for selection, single token spans are not run through the model
	// and their score is assumed to be zero.
	score_single_token_spans_as_zero:bool;
	}

	// A string key/value pair.
	// NOTE(review): no table in the visible part of this schema references this
	// type - confirm whether it is still needed.
	namespace libtextclassifier2.FeatureProcessorOptions_;
	table AlternativeCollectionMapEntry {
	key:string;
	value:string;
	}

	// Configuration of tokenization and feature extraction. Referenced from Model
	// as selection_feature_options and classification_feature_options.
	namespace libtextclassifier2;
	table FeatureProcessorOptions {
	// Number of buckets used for hashing charactergrams.
	num_buckets:int = -1;

	// Size of the embedding.
	embedding_size:int = -1;

	// Number of bits for quantization for embeddings.
	embedding_quantization_bits:int = 8;

	// Context size defines the number of words to the left and to the right of
	// the selected word to be used as context. For example, if context size is
	// N, then we take N words to the left and N words to the right of the
	// selected word as its context.
	context_size:int = -1;

	// Maximum number of words of the context to select in total.
	max_selection_span:int = -1;

	// Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
	// character trigrams etc.
	chargram_orders:[int];

	// Maximum length of a word, in codepoints.
	max_word_length:int = 20;

	// If true, will use the unicode-aware functionality for extracting features.
	unicode_aware_features:bool = 0;

	// Whether to extract the token case feature.
	extract_case_feature:bool = 0;

	// Whether to extract the selection mask feature.
	extract_selection_mask_feature:bool = 0;

	// List of regexps to run over each token. For each regexp, if there is a
	// match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
	regexp_feature:[string];

	// Whether to remap all digits to a single number.
	remap_digits:bool = 0;

	// Whether to lower-case each token before generating hashgrams.
	// No explicit default is given, so the FlatBuffers default (false) applies.
	lowercase_tokens:bool;

	// If true, the selection classifier output will contain only the selections
	// that are feasible (e.g., those that are shorter than max_selection_span),
	// if false, the output will be a complete cross-product of possible
	// selections to the left and possible selections to the right, including the
	// infeasible ones.
	// NOTE: Exists mainly for compatibility with older models that were trained
	// with the non-reduced output space.
	selection_reduced_output_space:bool = 1;

	// Collection names.
	collections:[string];

	// An index of collection in collections to be used if a collection name can't
	// be mapped to an id.
	default_collection:int = -1;

	// If true, will split the input by lines, and only use the line that contains
	// the clicked token.
	only_use_line_with_click:bool = 0;

	// If true, will split tokens that contain the selection boundary, at the
	// position of the boundary.
	// E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
	split_tokens_on_selection_boundaries:bool = 0;

	// Codepoint ranges that determine how different codepoints are tokenized.
	// The ranges must not overlap.
	tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

	// Method for selecting the center token. No explicit default is given, so
	// the value 0 (DEFAULT_CENTER_TOKEN_METHOD) applies when unset.
	center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

	// If true, span boundaries will be snapped to containing tokens and not
	// required to exactly match token boundaries.
	snap_label_span_boundaries_to_containing_tokens:bool;

	// A set of codepoint ranges supported by the model.
	supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

	// A set of codepoint ranges to use in the mixed tokenization mode to identify
	// stretches of tokens to re-tokenize using the internal tokenizer.
	internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

	// Minimum ratio of supported codepoints in the input context. If the ratio
	// is lower than this, the feature computation will fail.
	min_supported_codepoint_ratio:float = 0;

	// Used for versioning the format of features the model expects.
	// - feature_version == 0:
	// For each token the features consist of:
	// - chargram embeddings
	// - dense features
	// Chargram embeddings for tokens are concatenated first together,
	// and at the end, the dense features for the tokens are concatenated
	// to it. So the resulting feature vector has two regions.
	feature_version:int = 0;

	// Type of tokenization to use for the input text. Defaults to the internal
	// tokenizer.
	tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
	// NOTE(review): presumably controls whether ICU tokenization keeps
	// whitespace tokens - confirm against the tokenizer implementation.
	icu_preserve_whitespace_tokens:bool = 0;

	// List of codepoints that will be stripped from beginning and end of
	// predicted spans.
	ignored_span_boundary_codepoints:[int];

	// Bounds-sensitive feature extraction configuration.
	bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

	// List of allowed charactergrams. The extracted charactergrams are filtered
	// using this list, and charactergrams that are not present are interpreted as
	// out-of-vocabulary.
	// If no allowed_chargrams are specified, all charactergrams are allowed.
	// NOTE(review): this comment used to say the field is typed as bytes to
	// allow non-UTF8 chargrams, but the schema declares it as [string] - confirm
	// whether non-UTF8 chargrams are really expected here.
	allowed_chargrams:[string];

	// If true, tokens will be also split when the codepoint's script_id changes
	// as defined in TokenizationCodepointRange.
	tokenize_on_script_change:bool = 0;
	}

	// Declares libtextclassifier2.Model as the root table of serialized buffers.
	root_type libtextclassifier2.Model;