blob: 632703c765fda11edb385a6cda32aadd5fef5530 [file] [log] [blame]
// Four-character tag embedded in every serialized buffer so that readers can
// verify the buffer really holds this schema's root type before parsing.
file_identifier "TC2 ";
namespace libtextclassifier2;
// Token categories recognized by the rule-based datetime extractor. Each
// DatetimeModelExtractor entry binds one of these types to a regex pattern.
// NOTE(review): these numeric values are part of the serialized model format —
// never renumber or reuse existing values; only append new ones.
enum DatetimeExtractorType : int {
UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
// Meridiem markers.
AM = 1,
PM = 2,
// Month names.
JANUARY = 3,
FEBRUARY = 4,
MARCH = 5,
APRIL = 6,
MAY = 7,
JUNE = 8,
JULY = 9,
AUGUST = 10,
SEPTEMBER = 11,
OCTOBER = 12,
NOVEMBER = 13,
DECEMBER = 14,
// Relative-time modifiers.
NEXT = 15,
NEXT_OR_SAME = 16,
LAST = 17,
NOW = 18,
TOMORROW = 19,
YESTERDAY = 20,
PAST = 21,
FUTURE = 22,
// Singular time units.
DAY = 23,
WEEK = 24,
MONTH = 25,
YEAR = 26,
// Days of the week.
MONDAY = 27,
TUESDAY = 28,
WEDNESDAY = 29,
THURSDAY = 30,
FRIDAY = 31,
SATURDAY = 32,
SUNDAY = 33,
// Plural time units.
DAYS = 34,
WEEKS = 35,
MONTHS = 36,
HOURS = 37,
MINUTES = 38,
SECONDS = 39,
YEARS = 40,
// Numeric digit sequences (unsigned and signed).
DIGITS = 41,
SIGNEDDIGITS = 42,
// Spelled-out number words.
ZERO = 43,
ONE = 44,
TWO = 45,
THREE = 46,
FOUR = 47,
FIVE = 48,
SIX = 49,
SEVEN = 50,
EIGHT = 51,
NINE = 52,
TEN = 53,
ELEVEN = 54,
TWELVE = 55,
THIRTEEN = 56,
FOURTEEN = 57,
FIFTEEN = 58,
SIXTEEN = 59,
SEVENTEEN = 60,
EIGHTEEN = 61,
NINETEEN = 62,
TWENTY = 63,
THIRTY = 64,
FORTY = 65,
FIFTY = 66,
SIXTY = 67,
SEVENTY = 68,
EIGHTY = 69,
NINETY = 70,
HUNDRED = 71,
THOUSAND = 72,
}
// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
// If true, before the selection is returned, the unpaired brackets contained
// in the predicted selection are stripped from both selection ends.
// The bracket codepoints are defined in the Unicode standard:
// http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
strip_unpaired_brackets:bool = 1;
// Number of hypothetical click positions on either side of the actual click
// to consider in order to enforce symmetry.
symmetry_context_size:int;
// Number of examples to bundle in one batch for inference.
batch_size:int = 1024;
}
// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
// Limits for phone numbers: minimum and maximum digit counts a candidate
// must have to be classified as a phone number.
phone_min_num_digits:int = 7;
phone_max_num_digits:int = 15;
}
// List of regular expression matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
// The name of the collection of a match.
collection_name:string;
// The pattern to check.
// Can specify a single capturing group used as match boundaries.
pattern:string;
// Whether to apply the pattern for annotation.
enabled_for_annotation:bool = 0;
// Whether to apply the pattern for classification.
enabled_for_classification:bool = 0;
// Whether to apply the pattern for selection.
enabled_for_selection:bool = 0;
// The final score to assign to the results of this pattern.
target_classification_score:float = 1;
// Priority score used for conflict resolution with the other models.
priority_score:float = 0;
}
// Container for the regex-based matchers (see RegexModel_.Pattern).
namespace libtextclassifier2;
table RegexModel {
patterns:[libtextclassifier2.RegexModel_.Pattern];
}
// A group of regex patterns for recognizing datetime expressions, scoped to a
// set of locales.
namespace libtextclassifier2;
table DatetimeModelPattern {
// List of regex patterns.
regexes:[string];
// List of locale indices in DatetimeModel that represent the locales that
// these patterns should be used for. If empty, can be used for all locales.
locales:[int];
// The final score to assign to the results of this pattern.
target_classification_score:float = 1;
// Priority score used for conflict resolution with the other models.
priority_score:float = 0;
}
// Binds a DatetimeExtractorType to the regex that recognizes it.
namespace libtextclassifier2;
table DatetimeModelExtractor {
// The token category this rule extracts.
extractor:libtextclassifier2.DatetimeExtractorType;
// Regex used to match this extractor type.
pattern:string;
// Locale indices this rule applies to.
// NOTE(review): presumably indices into DatetimeModel.locales with the same
// "empty means all locales" semantics as DatetimeModelPattern.locales —
// confirm against the consuming code.
locales:[int];
}
// Rule-based datetime recognition model: supported locales plus the regex
// patterns and extractor rules that reference them by index.
namespace libtextclassifier2;
table DatetimeModel {
// List of BCP 47 locale strings representing all locales supported by the
// model. The individual patterns refer back to them using an index.
locales:[string];
patterns:[libtextclassifier2.DatetimeModelPattern];
extractors:[libtextclassifier2.DatetimeModelExtractor];
}
// Options controlling the output of the models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
// Lower bound threshold for filtering annotation model outputs.
min_annotate_confidence:float = 0;
}
// Top-level container bundling all sub-models and their options. This is the
// schema's root_type (declared at the end of the file), so a serialized model
// buffer is read as a Model.
namespace libtextclassifier2;
table Model {
// Comma-separated list of locales supported by the model as BCP 47 tags.
locales:string;
// Version of the model.
version:int;
// Feature extraction configuration for the selection and classification
// TFLite models respectively.
selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
classification_feature_options:libtextclassifier2.FeatureProcessorOptions;
// TFLite models. Serialized model bytes; 16-byte aligned so they can be
// memory-mapped and handed to the TFLite interpreter directly.
selection_model:[ubyte] (force_align: 16);
classification_model:[ubyte] (force_align: 16);
embedding_model:[ubyte] (force_align: 16);
// Regex-based matchers that run alongside the ML models.
regex_model:libtextclassifier2.RegexModel;
// Options for the different models.
selection_options:libtextclassifier2.SelectionModelOptions;
classification_options:libtextclassifier2.ClassificationModelOptions;
// Rule-based datetime recognition model.
datetime_model:libtextclassifier2.DatetimeModel;
// Options controlling the output of the models.
triggering_options:libtextclassifier2.ModelTriggeringOptions;
}
// Role of the codepoints in the range.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
// Concatenates the codepoint to the current run of codepoints.
DEFAULT_ROLE = 0,
// Splits a run of codepoints before the current codepoint.
SPLIT_BEFORE = 1,
// Splits a run of codepoints after the current codepoint.
SPLIT_AFTER = 2,
// Each codepoint will be a separate token. Good e.g. for Chinese
// characters.
TOKEN_SEPARATOR = 3,
// Discards the codepoint.
DISCARD_CODEPOINT = 4,
// Common values:
// Splits on the characters and discards them. Good e.g. for the space
// character.
// NOTE(review): 7 == SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT, which
// suggests roles are combined as bit flags — confirm with the tokenizer
// implementation before adding new values.
WHITESPACE_SEPARATOR = 7,
}
// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
// First codepoint of the range (inclusive).
start:int;
// One past the last codepoint of the range (exclusive).
end:int;
// How codepoints in this range affect token boundaries.
role:libtextclassifier2.TokenizationCodepointRange_.Role;
// Integer identifier of the script this range denotes. Negative values are
// reserved for Tokenizer's internal use.
script_id:int;
}
// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
DEFAULT_CENTER_TOKEN_METHOD = 0,
// Use click indices to determine the center token.
CENTER_TOKEN_FROM_CLICK = 1,
// Use selection indices to get a token range, and select the middle of it
// as the center token.
CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
INVALID_TOKENIZATION_TYPE = 0,
// Use the internal tokenizer for tokenization.
INTERNAL_TOKENIZER = 1,
// Use ICU for tokenization.
ICU = 2,
// First apply ICU tokenization. Then identify stretches of tokens
// consisting only of codepoints in internal_tokenizer_codepoint_ranges
// and re-tokenize them using the internal tokenizer.
MIXED = 3,
}
// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
// First codepoint of the range (inclusive).
start:int;
// One past the last codepoint of the range (exclusive).
end:int;
}
// Bounds-sensitive feature extraction configuration go/tc-bounds-sensitive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
// Enables the extraction of bounds-sensitive features, instead of the click
// context features.
enabled:bool;
// The numbers of tokens to extract in specific locations relative to the
// bounds.
// Immediately before the span.
num_tokens_before:int;
// Inside the span, aligned with the beginning.
num_tokens_inside_left:int;
// Inside the span, aligned with the end.
num_tokens_inside_right:int;
// Immediately after the span.
num_tokens_after:int;
// If true, also extracts the tokens of the entire span and adds up their
// features forming one "token" to include in the extracted features.
include_inside_bag:bool;
// If true, includes the selection length (in the number of tokens) as a
// feature.
include_inside_length:bool;
}
// A single key/value string pair mapping one collection name to an
// alternative name.
// NOTE(review): not referenced by any field visible in this file — confirm
// whether it is still used before extending it.
namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
key:string;
value:string;
}
// TC_STRIP
// Next ID: 44
// TC_END_STRIP
// Configuration of the feature extraction pipeline shared by the selection
// and classification models (see Model.selection_feature_options and
// Model.classification_feature_options).
namespace libtextclassifier2;
table FeatureProcessorOptions {
// Number of buckets used for hashing charactergrams.
num_buckets:int = -1;
// Size of the embedding.
embedding_size:int = -1;
// Context size defines the number of words to the left and to the right of
// the selected word to be used as context. For example, if context size is
// N, then we take N words to the left and N words to the right of the
// selected word as its context.
context_size:int = -1;
// Maximum number of words of the context to select in total.
max_selection_span:int = -1;
// Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
// character trigrams etc.
chargram_orders:[int];
// Maximum length of a word, in codepoints.
max_word_length:int = 20;
// If true, will use the unicode-aware functionality for extracting features.
unicode_aware_features:bool = 0;
// Whether to extract the token case feature.
extract_case_feature:bool = 0;
// Whether to extract the selection mask feature.
extract_selection_mask_feature:bool = 0;
// List of regexps to run over each token. For each regexp, if there is a
// match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
regexp_feature:[string];
// Whether to remap all digits to a single number.
remap_digits:bool = 0;
// Whether to lower-case each token before generating hashgrams.
lowercase_tokens:bool;
// If true, the selection classifier output will contain only the selections
// that are feasible (e.g., those that are shorter than max_selection_span),
// if false, the output will be a complete cross-product of possible
// selections to the left and possible selections to the right, including the
// infeasible ones.
// NOTE: Exists mainly for compatibility with older models that were trained
// with the non-reduced output space.
selection_reduced_output_space:bool = 1;
// Collection names.
collections:[string];
// An index of collection in collections to be used if a collection name can't
// be mapped to an id.
default_collection:int = -1;
// If true, will split the input by lines, and only use the line that contains
// the clicked token.
only_use_line_with_click:bool = 0;
// If true, will split tokens that contain the selection boundary, at the
// position of the boundary.
// E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
split_tokens_on_selection_boundaries:bool = 0;
// Codepoint ranges that determine how different codepoints are tokenized.
// The ranges must not overlap.
tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];
// How the center token is picked (from the click or from the selection).
center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;
// If true, span boundaries will be snapped to containing tokens and not
// required to exactly match token boundaries.
snap_label_span_boundaries_to_containing_tokens:bool;
// A set of codepoint ranges supported by the model.
supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
// A set of codepoint ranges to use in the mixed tokenization mode to identify
// stretches of tokens to re-tokenize using the internal tokenizer.
internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
// Minimum ratio of supported codepoints in the input context. If the ratio
// is lower than this, the feature computation will fail.
min_supported_codepoint_ratio:float = 0;
// Used for versioning the format of features the model expects.
// - feature_version == 0:
// For each token the features consist of:
// - chargram embeddings
// - dense features
// Chargram embeddings for tokens are concatenated first together,
// and at the end, the dense features for the tokens are concatenated
// to it. So the resulting feature vector has two regions.
feature_version:int = 0;
// Which tokenizer to run on the input text (internal, ICU, or mixed).
tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType;
// Whether ICU tokenization keeps whitespace-only tokens.
icu_preserve_whitespace_tokens:bool = 0;
// List of codepoints that will be stripped from beginning and end of
// predicted spans.
ignored_span_boundary_codepoints:[int];
// Bounds-sensitive feature extraction configuration.
bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;
// List of allowed charactergrams. The extracted charactergrams are filtered
// using this list, and charactergrams that are not present are interpreted as
// out-of-vocabulary.
// If no allowed_chargrams are specified, all charactergrams are allowed.
// The field is typed as bytes type to allow non-UTF8 chargrams.
// NOTE(review): the field is declared [string], not bytes; in FlatBuffers a
// string is expected to be UTF-8 — confirm non-UTF8 chargrams are actually
// representable here or update the comment.
allowed_chargrams:[string];
// If true, tokens will be also split when the codepoint's script_id changes
// as defined in TokenizationCodepointRange.
tokenize_on_script_change:bool = 0;
// Number of bits for quantization for embeddings.
embedding_quantization_bits:int = 8;
}
// Entry point of the schema: a serialized buffer is read as a Model.
root_type libtextclassifier2.Model;