blob: 632703c765fda11edb385a6cda32aadd5fef5530 [file] [log] [blame]
// Four-character tag embedded in every serialized buffer so that readers can
// verify the buffer really holds this schema's root type before parsing.
file_identifier "TC2 ";
namespace libtextclassifier2;
// Token categories recognized by the rule-based datetime extractor. Each
// DatetimeModelExtractor entry binds one of these types to a regex pattern.
// NOTE(review): these numeric values are part of the serialized model format —
// never renumber or reuse existing values; only append new ones.
enum DatetimeExtractorType : int {
UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
// Meridiem markers.
AM = 1,
PM = 2,
// Month names.
JANUARY = 3,
FEBRUARY = 4,
MARCH = 5,
APRIL = 6,
MAY = 7,
JUNE = 8,
JULY = 9,
AUGUST = 10,
SEPTEMBER = 11,
OCTOBER = 12,
NOVEMBER = 13,
DECEMBER = 14,
// Relative-time modifiers.
NEXT = 15,
NEXT_OR_SAME = 16,
LAST = 17,
NOW = 18,
TOMORROW = 19,
YESTERDAY = 20,
PAST = 21,
FUTURE = 22,
// Singular time units.
DAY = 23,
WEEK = 24,
MONTH = 25,
YEAR = 26,
// Days of the week.
MONDAY = 27,
TUESDAY = 28,
WEDNESDAY = 29,
THURSDAY = 30,
FRIDAY = 31,
SATURDAY = 32,
SUNDAY = 33,
// Plural time units.
DAYS = 34,
WEEKS = 35,
MONTHS = 36,
HOURS = 37,
MINUTES = 38,
SECONDS = 39,
YEARS = 40,
// Numeric digit sequences (unsigned and signed).
DIGITS = 41,
SIGNEDDIGITS = 42,
// Spelled-out number words.
ZERO = 43,
ONE = 44,
TWO = 45,
THREE = 46,
FOUR = 47,
FIVE = 48,
SIX = 49,
SEVEN = 50,
EIGHT = 51,
NINE = 52,
TEN = 53,
ELEVEN = 54,
TWELVE = 55,
THIRTEEN = 56,
FOURTEEN = 57,
FIFTEEN = 58,
SIXTEEN = 59,
SEVENTEEN = 60,
EIGHTEEN = 61,
NINETEEN = 62,
TWENTY = 63,
THIRTY = 64,
FORTY = 65,
FIFTY = 66,
SIXTY = 67,
SEVENTY = 68,
EIGHTY = 69,
NINETY = 70,
HUNDRED = 71,
THOUSAND = 72,
}
// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
// If true, before the selection is returned, the unpaired brackets contained
// in the predicted selection are stripped from both selection ends.
// The bracket codepoints are defined in the Unicode standard:
// http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
strip_unpaired_brackets:bool = 1;
// Number of hypothetical click positions on either side of the actual click
// to consider in order to enforce symmetry.
symmetry_context_size:int;
// Number of examples to bundle in one batch for inference.
batch_size:int = 1024;
}
// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
// Limits for phone numbers: minimum and maximum digit counts a candidate
// must have to be classified as a phone number.
phone_min_num_digits:int = 7;
phone_max_num_digits:int = 15;
}
// List of regular expression matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
// The name of the collection of a match.
collection_name:string;
// The pattern to check.
// Can specify a single capturing group used as match boundaries.
pattern:string;
// Whether to apply the pattern for annotation.
enabled_for_annotation:bool = 0;
// Whether to apply the pattern for classification.
enabled_for_classification:bool = 0;
// Whether to apply the pattern for selection.
enabled_for_selection:bool = 0;
// The final score to assign to the results of this pattern.
target_classification_score:float = 1;
// Priority score used for conflict resolution with the other models.
priority_score:float = 0;
}
// Container for the regex-based matchers (see RegexModel_.Pattern).
namespace libtextclassifier2;
table RegexModel {
patterns:[libtextclassifier2.RegexModel_.Pattern];
}
// A group of regex patterns for recognizing datetime expressions, scoped to a
// set of locales.
namespace libtextclassifier2;
table DatetimeModelPattern {
// List of regex patterns.
regexes:[string];
// List of locale indices in DatetimeModel that represent the locales that
// these patterns should be used for. If empty, can be used for all locales.
locales:[int];
// The final score to assign to the results of this pattern.
target_classification_score:float = 1;
// Priority score used for conflict resolution with the other models.
priority_score:float = 0;
}
// Binds a DatetimeExtractorType to the regex that recognizes it.
namespace libtextclassifier2;
table DatetimeModelExtractor {
// The token category this rule extracts.
extractor:libtextclassifier2.DatetimeExtractorType;
// Regex used to match this extractor type.
pattern:string;
// Locale indices this rule applies to.
// NOTE(review): presumably indices into DatetimeModel.locales with the same
// "empty means all locales" semantics as DatetimeModelPattern.locales —
// confirm against the consuming code.
locales:[int];
}
// Rule-based datetime recognition model: supported locales plus the regex
// patterns and extractor rules that reference them by index.
namespace libtextclassifier2;
table DatetimeModel {
// List of BCP 47 locale strings representing all locales supported by the
// model. The individual patterns refer back to them using an index.
locales:[string];
patterns:[libtextclassifier2.DatetimeModelPattern];
extractors:[libtextclassifier2.DatetimeModelExtractor];
}
// Options controlling the output of the models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
// Lower bound threshold for filtering annotation model outputs.
min_annotate_confidence:float = 0;
}
// Top-level container bundling all sub-models and their options. This is the
// schema's root_type (declared at the end of the file), so a serialized model
// buffer is read as a Model.
namespace libtextclassifier2;
table Model {
// Comma-separated list of locales supported by the model as BCP 47 tags.
locales:string;
// Version of the model.
version:int;
// Feature extraction configuration for the selection and classification
// TFLite models respectively.
selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
classification_feature_options:libtextclassifier2.FeatureProcessorOptions;
// TFLite models. Serialized model bytes; 16-byte aligned so they can be
// memory-mapped and handed to the TFLite interpreter directly.
selection_model:[ubyte] (force_align: 16);
classification_model:[ubyte] (force_align: 16);
embedding_model:[ubyte] (force_align: 16);
// Regex-based matchers that run alongside the ML models.
regex_model:libtextclassifier2.RegexModel;
// Options for the different models.
selection_options:libtextclassifier2.SelectionModelOptions;
classification_options:libtextclassifier2.ClassificationModelOptions;
// Rule-based datetime recognition model.
datetime_model:libtextclassifier2.DatetimeModel;
// Options controlling the output of the models.
triggering_options:libtextclassifier2.ModelTriggeringOptions;
}
// Role of the codepoints in the range.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
// Concatenates the codepoint to the current run of codepoints.
DEFAULT_ROLE = 0,
// Splits a run of codepoints before the current codepoint.
SPLIT_BEFORE = 1,
// Splits a run of codepoints after the current codepoint.
SPLIT_AFTER = 2,
// Each codepoint will be a separate token. Good e.g. for Chinese
// characters.
TOKEN_SEPARATOR = 3,
// Discards the codepoint.
DISCARD_CODEPOINT = 4,
// Common values:
// Splits on the characters and discards them. Good e.g. for the space
// character.
// NOTE(review): 7 == SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT, which
// suggests roles are combined as bit flags — confirm with the tokenizer
// implementation before adding new values.
WHITESPACE_SEPARATOR = 7,
}
// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
// First codepoint of the range (inclusive).
start:int;
// One past the last codepoint of the range (exclusive).
end:int;
// How codepoints in this range affect token boundaries.
role:libtextclassifier2.TokenizationCodepointRange_.Role;
// Integer identifier of the script this range denotes. Negative values are
// reserved for Tokenizer's internal use.
script_id:int;
}
// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
DEFAULT_CENTER_TOKEN_METHOD = 0,
// Use click indices to determine the center token.
CENTER_TOKEN_FROM_CLICK = 1,
// Use selection indices to get a token range, and select the middle of it
// as the center token.
CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
INVALID_TOKENIZATION_TYPE = 0,
// Use the internal tokenizer for tokenization.
INTERNAL_TOKENIZER = 1,
// Use ICU for tokenization.
ICU = 2,
// First apply ICU tokenization. Then identify stretches of tokens
// consisting only of codepoints in internal_tokenizer_codepoint_ranges
// and re-tokenize them using the internal tokenizer.
MIXED = 3,
}
// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
// First codepoint of the range (inclusive).
start:int;
// One past the last codepoint of the range (exclusive).
end:int;
}
// Bounds-sensitive feature extraction configuration go/tc-bounds-sensitive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
// Enables the extraction of bounds-sensitive features, instead of the click
// context features.
enabled:bool;
// The numbers of tokens to extract in specific locations relative to the
// bounds.
// Immediately before the span.
num_tokens_before:int;
// Inside the span, aligned with the beginning.
num_tokens_inside_left:int;
// Inside the span, aligned with the end.
num_tokens_inside_right:int;
// Immediately after the span.
num_tokens_after:int;
// If true, also extracts the tokens of the entire span and adds up their
// features forming one "token" to include in the extracted features.
include_inside_bag:bool;
// If true, includes the selection length (in the number of tokens) as a
// feature.
include_inside_length:bool;
}
// A single key/value string pair mapping one collection name to an
// alternative name.
// NOTE(review): not referenced by any field visible in this file — confirm
// whether it is still used before extending it.
namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
key:string;
value:string;
}
// TC_STRIP
// Next ID: 44
// TC_END_STRIP
// Configuration of the feature extraction pipeline shared by the selection
// and classification models (see Model.selection_feature_options and
// Model.classification_feature_options).
namespace libtextclassifier2;
table FeatureProcessorOptions {
// Number of buckets used for hashing charactergrams.
num_buckets:int = -1;
// Size of the embedding.
embedding_size:int = -1;
// Context size defines the number of words to the left and to the right of
// the selected word to be used as context. For example, if context size is
// N, then we take N words to the left and N words to the right of the
// selected word as its context.
context_size:int = -1;
// Maximum number of words of the context to select in total.
max_selection_span:int = -1;
// Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
// character trigrams etc.
chargram_orders:[int];
// Maximum length of a word, in codepoints.
max_word_length:int = 20;
// If true, will use the unicode-aware functionality for extracting features.
unicode_aware_features:bool = 0;
// Whether to extract the token case feature.
extract_case_feature:bool = 0;
// Whether to extract the selection mask feature.
extract_selection_mask_feature:bool = 0;
// List of regexps to run over each token. For each regexp, if there is a
// match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
regexp_feature:[string];
// Whether to remap all digits to a single number.
remap_digits:bool = 0;
// Whether to lower-case each token before generating hashgrams.
lowercase_tokens:bool;
// If true, the selection classifier output will contain only the selections
// that are feasible (e.g., those that are shorter than max_selection_span),
// if false, the output will be a complete cross-product of possible
// selections to the left and possible selections to the right, including the
// infeasible ones.
// NOTE: Exists mainly for compatibility with older models that were trained
// with the non-reduced output space.
selection_reduced_output_space:bool = 1;
// Collection names.
collections:[string];
// An index of collection in collections to be used if a collection name can't
// be mapped to an id.
default_collection:int = -1;
// If true, will split the input by lines, and only use the line that contains
// the clicked token.
only_use_line_with_click:bool = 0;
// If true, will split tokens that contain the selection boundary, at the
// position of the boundary.
// E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
split_tokens_on_selection_boundaries:bool = 0;
// Codepoint ranges that determine how different codepoints are tokenized.
// The ranges must not overlap.
tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];
// How the center token is picked (from the click or from the selection).
center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;
// If true, span boundaries will be snapped to containing tokens and not
// required to exactly match token boundaries.
snap_label_span_boundaries_to_containing_tokens:bool;
// A set of codepoint ranges supported by the model.
supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
// A set of codepoint ranges to use in the mixed tokenization mode to identify
// stretches of tokens to re-tokenize using the internal tokenizer.
internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
// Minimum ratio of supported codepoints in the input context. If the ratio
// is lower than this, the feature computation will fail.
min_supported_codepoint_ratio:float = 0;
// Used for versioning the format of features the model expects.
// - feature_version == 0:
// For each token the features consist of:
// - chargram embeddings
// - dense features
// Chargram embeddings for tokens are concatenated first together,
// and at the end, the dense features for the tokens are concatenated
// to it. So the resulting feature vector has two regions.
feature_version:int = 0;
// Which tokenizer to run on the input text (internal, ICU, or mixed).
tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType;
// Whether ICU tokenization keeps whitespace-only tokens.
icu_preserve_whitespace_tokens:bool = 0;
// List of codepoints that will be stripped from beginning and end of
// predicted spans.
ignored_span_boundary_codepoints:[int];
// Bounds-sensitive feature extraction configuration.
bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;
// List of allowed charactergrams. The extracted charactergrams are filtered
// using this list, and charactergrams that are not present are interpreted as
// out-of-vocabulary.
// If no allowed_chargrams are specified, all charactergrams are allowed.
// The field is typed as bytes type to allow non-UTF8 chargrams.
// NOTE(review): the field is declared [string], not bytes; in FlatBuffers a
// string is expected to be UTF-8 — confirm non-UTF8 chargrams are actually
// representable here or update the comment.
allowed_chargrams:[string];
// If true, tokens will be also split when the codepoint's script_id changes
// as defined in TokenizationCodepointRange.
tokenize_on_script_change:bool = 0;
// Number of bits for quantization for embeddings.
embedding_quantization_bits:int = 8;
}
// Entry point of the schema: a serialized buffer is read as a Model.
root_type libtextclassifier2.Model;