# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras index lookup preprocessing layer."""
# pylint: disable=g-classes-have-attributes
import collections
import json
import operator
import numpy as np
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_spec
from tensorflow.python.framework import tensor_util
from tensorflow.python.keras import backend
from tensorflow.python.keras.engine import base_preprocessing_layer
from tensorflow.python.keras.layers.preprocessing import category_encoding
from tensorflow.python.keras.layers.preprocessing import table_utils
from tensorflow.python.keras.saving.saved_model import layer_serialization
from tensorflow.python.keras.utils import layer_utils
from tensorflow.python.keras.utils import tf_utils
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
INT = "int"
MULTI_HOT = "multi_hot"
ONE_HOT = "one_hot"
COUNT = "count"
TF_IDF = "tf_idf"
_VOCAB_NAME = "vocab"
_IDF_WEIGHTS_NAME = "idf_weights"
class _NullInitializer(lookup_ops.TextFileInitializer):
"""A placeholder initializer for restoring this layer from a SavedModel."""
def __init__(self, key_dtype, value_dtype):
"""Construct a table initializer object.
Args:
key_dtype: Type of the table keys.
value_dtype: Type of the table values.
"""
self._key_dtype = dtypes.as_dtype(key_dtype)
self._value_dtype = dtypes.as_dtype(value_dtype)
@property
def key_dtype(self):
"""The expected table key dtype."""
return self._key_dtype
@property
def value_dtype(self):
"""The expected table value dtype."""
return self._value_dtype
def initialize(self, table):
"""Returns the table initialization op."""
pass
@property
def _shared_name(self):
"""Returns a shared name to be used by the table."""
shared_name = "NULL_INITIALIZER_"
if context.executing_eagerly():
# Ensure a unique name when eager execution is enabled to avoid spurious
      # sharing issues.
shared_name += str(backend.get_uid(shared_name))
return shared_name
class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
"""Maps values from a vocabulary to integer indices.
This layer translates a set of arbitrary hashables into an integer output via
a table-based lookup, with optional out-of-vocabulary handling. This is the
basis layer for both IntegerLookup and StringLookup; it holds the common
logic but is not intended to be exported as part of the Keras API.
Args:
max_tokens: The maximum size of the vocabulary for this layer. If None,
there is no cap on the size of the vocabulary. Note that this size
includes the OOV and mask tokens.
num_oov_indices: The number of out-of-vocabulary tokens to use. If this
value is more than 1, OOV inputs are hashed to determine their OOV value.
If this value is 0, OOV inputs will cause an error when calling the layer.
    mask_token: A token that represents masked inputs. When `output_mode` is
      `"int"`, the token is included in the vocabulary and mapped to index 0.
      In other output modes, the token will not appear in the vocabulary and
      instances of the mask token in the input will be dropped. If set to
      None, no mask term will be added.
oov_token: Only used when `invert` is True. The token to return for OOV
indices.
vocabulary: An optional list of vocabulary terms. If the list contains the
same token multiple times, an error will be thrown.
invert: Only valid when `output_mode` is `"int"`. If True, this layer will
map indices to vocabulary items instead of mapping vocabulary items to
      indices. Defaults to False.
output_mode: Specification for the output of the layer. Defaults to `"int"`.
Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or
`"tf_idf"` configuring the layer as follows:
- `"int"`: Return the raw integer indices of the input tokens.
- `"one_hot"`: Encodes each individual element in the input into an
array the same size as the vocabulary, containing a 1 at the element
index. If the last dimension is size 1, will encode on that dimension.
If the last dimension is not size 1, will append a new dimension for
the encoded output.
- `"multi_hot"`: Encodes each sample in the input into a single array
the same size as the vocabulary, containing a 1 for each vocabulary
term present in the sample. Treats the last dimension as the sample
dimension, if input shape is (..., sample_length), output shape will
be (..., num_tokens).
- `"count"`: As `"multi_hot"`, but the int array contains a count of the
number of times the token at that index appeared in the sample.
- `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
find the value in each token slot.
pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`,
`"count"`, or `"tf_idf"`. If True, the output will have its feature axis
padded to `max_tokens` even if the number of unique tokens in the
vocabulary is less than max_tokens, resulting in a tensor of shape
[batch_size, max_tokens] regardless of vocabulary size. Defaults to False.
sparse: Boolean. Only applicable to `"multi_hot"` and `"count"` output
modes. If True, returns a `SparseTensor` instead of a dense `Tensor`.
Defaults to False.
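
  Example:

  A minimal usage sketch of this internal layer (most users reach this
  functionality through `StringLookup` or `IntegerLookup` instead); the token
  names and mask/OOV choices below are purely illustrative, and `tf` is
  assumed to be imported:

  >>> vocab = ["a", "b", "c", "d"]
  >>> layer = IndexLookup(
  ...     max_tokens=None, num_oov_indices=1, mask_token="[MASK]",
  ...     oov_token="[OOV]", vocabulary=vocab, dtype=tf.string)
  >>> layer(tf.constant([["a", "c", "d"], ["d", "z", "b"]]))
  <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
  array([[2, 4, 5],
         [5, 1, 3]])>

  With the default `output_mode="int"` and a mask token, index 0 is reserved
  for the mask and index 1 for the single OOV bucket, so vocabulary terms
  start at index 2.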
"""
def __init__(self,
max_tokens,
num_oov_indices,
mask_token,
oov_token,
vocabulary=None,
invert=False,
output_mode=INT,
sparse=False,
pad_to_max_tokens=False,
**kwargs):
# If max_tokens is set, the value must be greater than 1 - otherwise we
# are creating a 0-element vocab, which doesn't make sense.
if max_tokens is not None and max_tokens <= 1:
raise ValueError("If set, `max_tokens` must be greater than 1. "
"You passed {}".format(max_tokens))
if num_oov_indices < 0:
raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
"You passed {}".format(num_oov_indices))
# Support deprecated names for output_modes.
if output_mode == "binary":
output_mode = MULTI_HOT
if output_mode == "tf-idf":
output_mode = TF_IDF
# 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF)
layer_utils.validate_string_arg(
output_mode,
allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF),
layer_name=self.__class__.__name__,
arg_name="output_mode")
if invert and output_mode != INT:
raise ValueError("`output_mode` must be {} when `invert` is true. You "
"passed {}".format(INT, output_mode))
self.invert = invert
self.max_tokens = max_tokens
self.num_oov_indices = num_oov_indices
self.output_mode = output_mode
self.sparse = sparse
self.pad_to_max_tokens = pad_to_max_tokens
self._called = False
# A note on vocab_size: we need to always keep a non-Tensor representation
# of vocab_size around to use in graph building. Because we might be
# in a tf.function, we can't rely on evaluating the actual tables to
# find the value either.
self._vocab_size = None
    # We need to keep track of our current vocab size outside of our layer
    # weights to support a static output shape when `output_mode != INT`. The
    # bincount ops do not set shape on their outputs, which means we have to
    # set it ourselves. We persist the current vocab size as a hidden part of
    # the config when serializing our model.
if "vocabulary_size" in kwargs:
self._vocab_size = kwargs["vocabulary_size"]
del kwargs["vocabulary_size"]
restore_from_static_table = kwargs.pop("has_static_table", False)
# Make sure the mask token and oov token are truly of the dtype we want. We
# can ignore strings here, because they have only one dtype.
dtype = kwargs["dtype"]
if dtype == dtypes.int32:
mask_token = None if mask_token is None else np.int32(mask_token)
oov_token = None if oov_token is None else np.int32(oov_token)
elif dtype == dtypes.int64:
mask_token = None if mask_token is None else np.int64(mask_token)
oov_token = None if oov_token is None else np.int64(oov_token)
self.mask_token = mask_token
self.oov_token = oov_token
if max_tokens is not None:
available_vocab_size = max_tokens - self._token_start_index()
else:
available_vocab_size = None
super(IndexLookup, self).__init__(
combiner=_IndexLookupCombiner(
vocab_size=available_vocab_size,
mask_value=mask_token,
oov_value=oov_token,
compute_idf=(output_mode == TF_IDF)),
**kwargs)
# We need to save the key dtype so that we know if we're expecting int64
# keys. If we are, we will cast int32 inputs to int64 as well.
if invert:
self._key_dtype = dtypes.int64
self._value_dtype = self.dtype
self._mask_key = 0
self._mask_value = mask_token
key_index = lookup_ops.TextFileIndex.LINE_NUMBER
value_index = lookup_ops.TextFileIndex.WHOLE_LINE
default_value = self.oov_token
oov_indices = None
else:
self._key_dtype = self.dtype
self._value_dtype = dtypes.int64
self._mask_key = mask_token
key_index = lookup_ops.TextFileIndex.WHOLE_LINE
value_index = lookup_ops.TextFileIndex.LINE_NUMBER
# Masks should map to 0 for int output and be dropped otherwise. Max ints
# will be dropped from the bincount op.
self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
oov_start = self._oov_start_index()
token_start = self._token_start_index()
if self.num_oov_indices == 0:
# If there are no OOV indices, we map OOV tokens to -1 and error out
# during call if we find a negative index.
default_value = -1
oov_indices = None
elif self.num_oov_indices == 1:
# If there is only one OOV index, we can set that index as the default
# value of the index_lookup table.
default_value = oov_start
oov_indices = None
else:
        # If we have multiple OOV values, we need to do a further hashing
        # step; to make this easier, we set the OOV value to -1. (This lets us
        # do a vectorized add and cast to boolean to determine locations where
        # we need to do extra hashing.)
default_value = -1
oov_indices = list(range(oov_start, token_start))
self._static_vocabulary_path = None
has_vocab_path = (vocabulary is not None and isinstance(vocabulary, str))
if has_vocab_path or restore_from_static_table:
self._has_static_table = True
if vocabulary is None:
# If we're restoring a layer that was saved with a static table
# initializer, we create a fake initializer object to let the code
        # progress. The SavedModel restoration code will handle restoring
# the actual data.
initializer = _NullInitializer(self._key_dtype, self._value_dtype)
else:
if not gfile.Exists(vocabulary):
raise ValueError("Vocabulary file %s does not exist." % (vocabulary,))
self._static_vocabulary_path = vocabulary
num_tokens = table_utils.num_tokens_in_file(vocabulary)
self._vocab_size = self._token_start_index() + num_tokens
initializer = lookup_ops.TextFileInitializer(
filename=vocabulary,
key_dtype=self._key_dtype,
key_index=key_index,
value_dtype=self._value_dtype,
value_index=value_index,
value_index_offset=self._token_start_index())
self._table = lookup_ops.StaticHashTable(
initializer, default_value=default_value)
self._table_handler = table_utils.TableHandler(
table=self._table,
mask_token=self._mask_key if self.mask_token is not None else None,
mask_value=self._mask_value,
oov_tokens=oov_indices)
tracked_table = self._add_trackable(self._table, trainable=False)
else:
self._has_static_table = False
self._table = lookup_ops.MutableHashTable(
key_dtype=self._key_dtype,
value_dtype=self._value_dtype,
default_value=default_value,
name=(self._name + "_index_table"))
self._table_handler = table_utils.TableHandler(
table=self._table,
oov_tokens=oov_indices)
if vocabulary is not None:
self.set_vocabulary(vocabulary)
tracked_table = self._add_trackable(self._table, trainable=False)
if self.output_mode == TF_IDF:
      # The TF-IDF weight may have a (None,) tensor shape. This creates a 1D
      # variable with arbitrary shape, which we can assign any weight to so
      # long as it has 1 dimension. In order to properly initialize this
      # weight in Keras, we need to provide a custom callable initializer
      # which does not depend on the shape of the weight (as all other
      # initializers do), since the shape is not known until the vocabulary
      # is set. Hence the `lambda shape, dtype: [0]` initializer.
if not self.pad_to_max_tokens or max_tokens is None:
initializer = lambda shape, dtype: [0]
else:
initializer = init_ops.zeros_initializer
# We are adding these here instead of in build() since they do not depend
# on the input shape at all.
idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
self.tf_idf_weights = self._add_state_variable(
name="idf",
shape=tensor_shape.TensorShape(idf_shape),
dtype=backend.floatx(),
initializer=initializer)
# This is a workaround for summary() on this layer. Because the table is
# not mutable during training, the effective number of parameters (and so
# the weight shape) is 0; we add this as an attr so that the parameter
# counting code in the Model object doesn't throw an attribute error.
tracked_table.shape = tensor_shape.TensorShape((0,))
def compute_output_shape(self, input_shape):
if self.output_mode == INT:
return input_shape
if self._vocab_size and not self.pad_to_max_tokens:
out_depth = self._vocab_size
else:
out_depth = self.max_tokens
return tensor_shape.TensorShape([input_shape[0], out_depth])
def compute_output_signature(self, input_spec):
output_shape = self.compute_output_shape(input_spec.shape.as_list())
output_dtype = (self._value_dtype if self.output_mode == INT
else backend.floatx())
return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
def adapt(self, data, reset_state=True):
"""Fits the state of the preprocessing layer to the dataset.
Overrides the default adapt method to apply relevant preprocessing to the
inputs before passing to the combiner.
Args:
data: The data to train on. It can be passed either as a tf.data Dataset,
or as a numpy array.
reset_state: Optional argument specifying whether to clear the state of
the layer at the start of the call to `adapt`. This must be True for
this layer, which does not support repeated calls to `adapt`.
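
    Example:

    A small sketch, assuming `numpy` is imported as `np` and a string dtype;
    any `tf.data.Dataset` or numpy array of tokens works similarly:

    >>> layer = IndexLookup(
    ...     max_tokens=None, num_oov_indices=1, mask_token=None,
    ...     oov_token="[OOV]", dtype=tf.string)
    >>> layer.adapt(np.array([["a", "b"], ["b", "c"]]))

    After `adapt`, the vocabulary is ordered by descending token frequency,
    with the single OOV bucket occupying index 0.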
"""
if not reset_state:
raise ValueError("IndexLookup does not support streaming adapts.")
super(IndexLookup, self).adapt(data, reset_state)
def get_vocabulary(self, include_special_tokens=True):
"""Returns the current vocabulary of the layer.
Args:
include_special_tokens: If True, the returned vocabulary will include mask
and OOV tokens, and a term's index in the vocabulary will equal the
term's index when calling the layer. If False, the returned vocabulary
will not include any mask or OOV tokens.
"""
if self.vocabulary_size() is None:
return []
    # The MutableHashTable data will not be sorted, so we will create an
    # inverted lookup here, and use that to look up a range of indices
    # [0, vocab_size).
keys, values = self._table.export()
vocab, indices = (values, keys) if self.invert else (keys, values)
lookup = collections.defaultdict(
lambda: self.oov_token,
zip(indices.numpy(), self._tensor_vocab_to_numpy(vocab)))
vocab = [lookup[x] for x in range(self.vocabulary_size())]
if self.mask_token is not None and self.output_mode == INT:
vocab[0] = self.mask_token
if not include_special_tokens:
vocab = vocab[self._token_start_index():]
return vocab
def vocabulary_size(self):
"""Gets the current size of the layer's vocabulary.
Returns:
      The integer size of the vocabulary, including optional mask and OOV
      indices.
"""
return self._vocab_size
def vocab_size(self):
logging.warning("vocab_size is deprecated, please use vocabulary_size.")
return self.vocabulary_size()
def get_config(self):
if self._has_static_table:
vocabulary_path = self._static_vocabulary_path
else:
vocabulary_path = None
config = {
"invert": self.invert,
"max_tokens": self.max_tokens,
"num_oov_indices": self.num_oov_indices,
"oov_token": self.oov_token,
"mask_token": self.mask_token,
"output_mode": self.output_mode,
"pad_to_max_tokens": self.pad_to_max_tokens,
"vocabulary_size": self.vocabulary_size(),
"vocabulary": vocabulary_path,
}
if self._has_static_table:
config["has_static_table"] = True
base_config = super(IndexLookup, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def count_params(self):
# This method counts the number of scalars in the weights of this layer.
# Since this layer doesn't have any /actual/ weights (in that there's
# nothing in this layer that can be trained - we only use the weight
# abstraction for ease of saving!) we return 0.
return 0
def set_vocabulary(self, vocabulary, idf_weights=None):
"""Sets vocabulary (and optionally document frequency) data for this layer.
This method sets the vocabulary and idf weights for this layer directly,
instead of analyzing a dataset through `adapt`. It should be used whenever
the vocab (and optionally document frequency) information is already known.
If vocabulary data is already present in the layer, this method will replace
it.
Args:
vocabulary: An array, numpy array, or tensor of hashable tokens.
      idf_weights: An array, numpy array, or tensor of inverse document
        frequency weights equal in length to the vocabulary. Only necessary
        if the layer `output_mode` is `"tf_idf"`.
Raises:
ValueError: If there are too many inputs, the inputs do not match, or
input data is missing.
      RuntimeError: If the vocabulary cannot be set when this function is
        called. This happens in `"multi_hot"`, `"count"`, and `"tf_idf"`
        modes if `pad_to_max_tokens` is False and the layer itself has
        already been called.
RuntimeError: If a tensor vocabulary is passed outside of eager execution.
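
    Example:

    A minimal sketch for the `"tf_idf"` case; the token names and weights
    below are purely illustrative:

    >>> layer = IndexLookup(
    ...     max_tokens=None, num_oov_indices=1, mask_token=None,
    ...     oov_token="[OOV]", output_mode="tf_idf", dtype=tf.string)
    >>> layer.set_vocabulary(["a", "b", "c"], idf_weights=[0.8, 0.5, 0.25])

    Because the passed vocabulary does not include the OOV token, the idf
    weight for the OOV slot is filled in with the average of the provided
    `idf_weights`.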
"""
if self._has_static_table:
raise RuntimeError("Layer {} was created with a static file-based table "
"because a file path was passed to the layer "
"init. Layers created with static file-based tables "
"do not support changing the vocabulary after "
"creation.".format(self.name))
if self.output_mode != TF_IDF and idf_weights is not None:
raise ValueError("`idf_weights` should only be set if output_mode is "
"TF_IDF. output_mode is {}.".format(self.output_mode))
if (self.output_mode in [MULTI_HOT, COUNT, TF_IDF] and self._called and
not self.pad_to_max_tokens):
raise RuntimeError("When using {} mode and `pad_to_max_tokens` is "
"False, the vocabulary cannot be changed after the "
"layer is called.".format(self.output_mode))
if not context.executing_eagerly() and (tensor_util.is_tensor(vocabulary) or
tensor_util.is_tensor(idf_weights)):
raise RuntimeError(
"Cannot set a tensor vocabulary on {} layer {} when not executing "
"eagerly. Create this layer or call `set_vocabulary` outside of "
"any `tf.function`s and with eager execution enabled.".format(
self.__class__.__name__, self.name))
# TODO(mattdangerw): for better performance we should rewrite this entire
# function to operate on tensors and convert vocabulary to a tensor here.
if tensor_util.is_tensor(vocabulary):
vocabulary = self._tensor_vocab_to_numpy(vocabulary)
if tensor_util.is_tensor(idf_weights):
idf_weights = idf_weights.numpy()
oov_start = self._oov_start_index()
token_start = self._token_start_index()
should_have_mask = (oov_start > 0)
has_mask = should_have_mask and vocabulary[0] == self.mask_token
should_have_oov = (self.num_oov_indices > 0)
expected_oov = [self.oov_token] * self.num_oov_indices
found_oov = vocabulary[oov_start:token_start]
has_oov = should_have_oov and found_oov == expected_oov
# If we get a numpy array, then has_oov may end up being a numpy array
# instead of a bool. Fix this by collapsing the variable if it's not bool.
if not isinstance(has_oov, bool):
has_oov = any(has_oov)
if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
raise ValueError(
"Invalid vocabulary format. The layer was created with "
"`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
"included in the provided vocabulary. The passed vocabulary has the "
"correct mask token `{mask}` at index 0, but does not have the OOV "
"token `{oov}` in indices [{start}:{end}]. Instead, we found "
"`{found}`. Was this vocabulary generated by a layer with "
"incompatible settings?".format(
mask=self.mask_token,
oov=self.oov_token,
start=oov_start,
end=token_start,
found=found_oov))
if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
raise ValueError(
"Invalid vocabulary format. The layer was created with "
"`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
"included in the provided vocabulary. The passed vocabulary has the "
"correct OOV token `{oov}` at indices [{start}:{end}], but does not "
"have the mask token `{mask}` in index 0. Instead, we found "
"`{found}`. Was this vocabulary generated by a layer with "
"incompatible settings?".format(
mask=self.mask_token,
oov=self.oov_token,
start=oov_start,
end=token_start,
found=vocabulary[0]))
found_special_tokens = has_oov or has_mask
if found_special_tokens:
tokens = vocabulary[token_start:]
else:
tokens = vocabulary
repeated_tokens = table_utils.find_repeated_tokens(tokens)
if repeated_tokens:
raise ValueError("The passed vocabulary has at least one repeated "
"term. Please uniquify your dataset. The repeated terms "
"are {}".format(repeated_tokens))
if self.mask_token in tokens:
raise ValueError("Reserved mask token {} was found in the passed "
"vocabulary at index {}. Please either remove the "
"reserved token from the vocabulary or change the "
"mask token for this layer.".format(
self.mask_token, tokens.index(self.mask_token)))
if self.oov_token in tokens:
raise ValueError("Reserved OOV token {} was found in the passed "
"vocabulary at index {}. Please either remove the "
"reserved token from the vocabulary or change the "
"OOV token for this layer.".format(
self.oov_token, tokens.index(self.oov_token)))
self._vocab_size = token_start + len(tokens)
if self.max_tokens is not None and self._vocab_size > self.max_tokens:
raise ValueError(
"Attempted to set a vocabulary larger than the maximum vocab size. "
"Passed vocab size is {}, max vocab size is {}.".format(
self._vocab_size, self.max_tokens))
if self.output_mode == TF_IDF:
if idf_weights is None:
raise ValueError("`idf_weights` must be set if output_mode is TF_IDF")
if len(vocabulary) != len(idf_weights):
raise ValueError("`idf_weights` must be the same length as vocabulary. "
"len(idf_weights) is {}, len(vocabulary) is {}".format(
len(vocabulary), len(idf_weights)))
idf_weights = self._convert_to_ndarray(idf_weights)
if idf_weights.ndim != 1:
        raise ValueError(
            "TF-IDF data must be a 1-dimensional array, but received "
            "{}".format(type(idf_weights)))
# We add the non-special vocab tokens and optionally the mask_token to our
# hash table. OOV tokens are handled with the hash table default value and
# not added directly.
self._table_handler.clear()
indices = np.arange(token_start, len(tokens) + token_start, dtype=np.int64)
if self.invert:
self._table_handler.insert(indices, tokens)
else:
self._table_handler.insert(tokens, indices)
if self.mask_token is not None:
self._table_handler.insert([self._mask_key], [self._mask_value])
if self.output_mode == TF_IDF:
# If the passed vocabulary has no special tokens, we need to pad the front
# of idf_weights. We don't have real document frequencies for these tokens
# so we will use an average of all idf_weights passed in as a reasonable
# default.
if found_special_tokens:
front_padding = 0
front_padding_value = 0
else:
front_padding = token_start
front_padding_value = np.average(idf_weights)
# If pad_to_max_tokens is true, and max_tokens is greater than our total
# vocab size, we need to pad the back of idf_weights with zeros as well.
back_padding_value = 0
if self.pad_to_max_tokens and self.max_tokens is not None:
back_padding = self.max_tokens - front_padding - len(idf_weights)
else:
back_padding = 0
idf_weights = np.pad(
idf_weights, (front_padding, back_padding),
"constant",
constant_values=(front_padding_value, back_padding_value))
backend.set_value(self.tf_idf_weights, idf_weights)
def _set_state_variables(self, updates):
if not self.built:
raise RuntimeError("_set_state_variables() must be called after build().")
self.set_vocabulary(
updates[_VOCAB_NAME], idf_weights=updates[_IDF_WEIGHTS_NAME])
def call(self, inputs):
if isinstance(inputs, (list, tuple, np.ndarray)):
inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)
if not self.max_tokens and self._vocab_size is None:
raise ValueError("You must set the layer's vocabulary before calling it. "
"Either pass a `vocabulary` argument to the layer, or "
"call `layer.adapt(dataset)` with some sample data.")
self._called = True
if self._key_dtype == dtypes.int64 and inputs.dtype == dtypes.int32:
inputs = math_ops.cast(inputs, dtypes.int64)
lookup_result = self._table_handler.lookup(inputs)
lookup_checks = []
if self.num_oov_indices == 0 and not self.invert:
if tf_utils.is_sparse(inputs):
lookup_values = lookup_result.values
input_values = inputs.values
elif tf_utils.is_ragged(inputs):
lookup_values = lookup_result.flat_values
input_values = inputs.flat_values
else:
lookup_values = lookup_result
input_values = inputs
oov_indices = array_ops.where_v2(math_ops.equal(lookup_values, -1))
oov_inputs = array_ops.gather_nd(input_values, oov_indices)
msg = string_ops.string_format(
"When `num_oov_indices=0` all inputs should be in vocabulary, "
"found OOV values {}, consider setting `num_oov_indices=1`.",
(oov_inputs,))
assertion = control_flow_ops.Assert(
math_ops.equal(array_ops.size(oov_indices), 0), [msg])
lookup_checks.append(assertion)
with ops.control_dependencies(lookup_checks):
if self.output_mode == INT:
return array_ops.identity(lookup_result)
else:
return self._encode_output(lookup_result)
def _encode_output(self, lookup_result):
def expand_dims(inputs, axis):
if tf_utils.is_sparse(inputs):
return sparse_ops.sparse_expand_dims(inputs, axis)
else:
return array_ops.expand_dims(inputs, axis)
original_shape = lookup_result.shape
# In all cases, we should uprank scalar input to a single sample.
if lookup_result.shape.rank == 0:
lookup_result = expand_dims(lookup_result, -1)
    # One-hot will uprank only if the final output dimension is not already 1.
if self.output_mode == ONE_HOT:
if lookup_result.shape[-1] != 1:
lookup_result = expand_dims(lookup_result, -1)
# TODO(b/190445202): remove output rank restriction.
if lookup_result.shape.rank > 2:
raise ValueError(
"Received input shape {}, which would result in output rank {}. "
"Currently only outputs up to rank 2 are supported for "
"`output_mode={}`.".format(original_shape, lookup_result.shape.rank,
self.output_mode))
binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
if self._vocab_size and not self.pad_to_max_tokens:
out_depth = self._vocab_size
else:
out_depth = self.max_tokens
if self.sparse:
bincounts = category_encoding.sparse_bincount(lookup_result, out_depth,
binary_output)
else:
bincounts = category_encoding.dense_bincount(lookup_result, out_depth,
binary_output)
if self.output_mode == TF_IDF:
return math_ops.multiply(bincounts, self.tf_idf_weights)
return bincounts
def _convert_to_ndarray(self, x):
return np.array(x) if isinstance(x, (list, tuple)) else x
def _oov_start_index(self):
return 1 if self.mask_token is not None and self.output_mode == INT else 0
def _token_start_index(self):
return self._oov_start_index() + self.num_oov_indices
@property
def _trackable_saved_model_saver(self):
return layer_serialization.IndexLookupLayerSavedModelSaver(self)
# Override points for IntegerLookup and StringLookup.
def _tensor_vocab_to_numpy(self, vocabulary):
"""Converts a tensor vocabulary to a numpy vocabulary."""
return vocabulary.numpy()
class _IndexLookupAccumulator(
collections.namedtuple("Accumulator",
["data", "count_dict", "per_doc_count_dict"])):
pass
class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
"""Combiner for the IndexLookup preprocessing layer.
This class encapsulates the logic for computing a vocabulary based on the
frequency of each token.
Attributes:
vocab_size: (Optional) If set, only the top `vocab_size` tokens (based on
frequency across the dataset) are retained in the vocabulary. If None, or
set to a value greater than the total number of distinct tokens in the
dataset, all tokens are retained.
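
  Example:

  A rough sketch of the combiner protocol on plain Python lists (the exact
  input conversion is handled by `convert_to_list`); the tokens here are
  illustrative:

  >>> combiner = _IndexLookupCombiner(vocab_size=2)
  >>> acc = combiner.compute([["a", "b"], ["b", "c"]])
  >>> acc = combiner.compute([["c", "b"]], acc)

  Calling `extract(acc)` then returns the `vocab_size` most frequent tokens
  under the `"vocab"` key (here "b", then "c").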
"""
def __init__(self,
vocab_size=None,
mask_value=None,
oov_value=None,
compute_idf=False):
self._vocab_size = vocab_size
self._mask_value = mask_value
self._oov_value = oov_value
self._compute_idf = compute_idf
def compute(self, values, accumulator=None):
"""Compute a step in this computation, returning a new accumulator."""
values = base_preprocessing_layer.convert_to_list(
values, sparse_default_value=self._mask_value)
if accumulator is None:
accumulator = self._create_accumulator()
# TODO(momernick): Benchmark improvements to this algorithm.
if not isinstance(values, list):
values = [values]
for document in values:
if not isinstance(document, list):
document = [document]
if self._compute_idf:
current_doc_id = accumulator.data["next_doc_id"]
accumulator.data["next_doc_id"] += 1
for token in document:
accumulator.count_dict[token] += 1
if self._compute_idf:
doc_count = accumulator.per_doc_count_dict[token]
if doc_count["last_doc_id"] != current_doc_id:
doc_count["count"] += 1
doc_count["last_doc_id"] = current_doc_id
return accumulator
def merge(self, accumulators):
"""Merge several accumulators to a single accumulator."""
if not accumulators:
return accumulators
base_accumulator = accumulators[0]
for accumulator in accumulators[1:]:
for token, value in accumulator.count_dict.items():
base_accumulator.count_dict[token] += value
if self._compute_idf:
base_accumulator.data["next_doc_id"] += accumulator.data["next_doc_id"]
if self._compute_idf:
for token, value in accumulator.per_doc_count_dict.items():
        # Any newly created token counts in `base_accumulator`'s
        # per_doc_count_dict will have a last_doc_id of -1. This is always
        # less than the next doc id (which is never negative), so any future
        # occurrences are guaranteed to be counted.
base_accumulator.per_doc_count_dict[token]["count"] += value[
"count"]
return base_accumulator
def extract(self, accumulator):
"""Convert an accumulator into a dict of output values.
Args:
accumulator: An accumulator aggregating over the full dataset.
Returns:
A dict of:
"vocab": A list of the retained items in the vocabulary.
"""
vocab_counts = accumulator.count_dict
# Drop special tokens from our vocab.
if self._mask_value in vocab_counts:
del vocab_counts[self._mask_value]
if self._oov_value in vocab_counts:
del vocab_counts[self._oov_value]
# Data processed by the accumulator could be tensors, numpy arrays or lists.
# For tensor string input, values will have been converted into bytes. We
# need to check the bytes version of special tokens in this case.
if isinstance(self._mask_value, str):
mask_value_bytes = compat.as_bytes(self._mask_value)
if mask_value_bytes in vocab_counts:
del vocab_counts[mask_value_bytes]
if isinstance(self._oov_value, str):
oov_value_bytes = compat.as_bytes(self._oov_value)
if oov_value_bytes in vocab_counts:
del vocab_counts[oov_value_bytes]
sorted_counts = sorted(
vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True)
vocab_data = (
sorted_counts[:self._vocab_size] if self._vocab_size else sorted_counts)
vocab = [data[0] for data in vocab_data]
if self._compute_idf:
num_documents = accumulator.data["next_doc_id"]
document_counts = accumulator.per_doc_count_dict
doc_counts = [document_counts[token]["count"] for token in vocab]
idf_weights = self._inverse_document_frequency(doc_counts, num_documents)
else:
idf_weights = None
return {_VOCAB_NAME: vocab, _IDF_WEIGHTS_NAME: idf_weights}
def restore(self, output):
"""Create an accumulator based on 'output'."""
raise NotImplementedError(
"IndexLookup does not restore or support streaming updates.")
def serialize(self, accumulator):
"""Serialize an accumulator for a remote call."""
output_dict = {}
output_dict["vocab"] = list(accumulator.count_dict.keys())
output_dict["vocab_counts"] = list(accumulator.count_dict.values())
if self._compute_idf:
output_dict["data"] = accumulator.data
output_dict["idf_vocab"] = list(accumulator.per_doc_count_dict.keys())
output_dict["idf_counts"] = [
counter["count"]
for counter in accumulator.per_doc_count_dict.values()
]
return compat.as_bytes(json.dumps(output_dict))
def deserialize(self, encoded_accumulator):
"""Deserialize an accumulator received from 'serialize()'."""
accumulator_dict = json.loads(compat.as_text(encoded_accumulator))
accumulator = self._create_accumulator()
count_dict = dict(
zip(accumulator_dict["vocab"], accumulator_dict["vocab_counts"]))
accumulator.count_dict.update(count_dict)
if self._compute_idf:
accumulator.data = accumulator_dict["data"]
create_dict = lambda x: {"count": x, "last_doc_id": -1}
idf_count_dicts = [
create_dict(count) for count in accumulator_dict["idf_counts"]
]
idf_dict = dict(zip(accumulator_dict["idf_vocab"], idf_count_dicts))
accumulator.per_doc_count_dict.update(idf_dict)
return accumulator
def _create_accumulator(self):
"""Accumulate a sorted array of vocab tokens and corresponding counts."""
if self._compute_idf:
create_default_dict = lambda: {"count": 0, "last_doc_id": -1}
per_doc_count_dict = collections.defaultdict(create_default_dict)
data = {"next_doc_id": 0}
else:
per_doc_count_dict = None
data = None
count_dict = collections.defaultdict(int)
return _IndexLookupAccumulator(data, count_dict, per_doc_count_dict)
def _inverse_document_frequency(self, document_counts, num_documents):
"""Computes the inverse-document-frequency (IDF) component of TF-IDF.
Uses the default weighting scheme described in
https://en.wikipedia.org/wiki/Tf%E2%80%93idf.
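
    For example, a token that appears in 1 of 100 documents receives weight
    log(1 + 100 / (1 + 1)) = log(51), roughly 3.93, while a token that
    appears in every one of the 100 documents receives log(1 + 100 / 101),
    roughly 0.69.
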
Args:
document_counts: An array of the # of documents each token appears in.
      num_documents: An int representing the total number of documents.
Returns:
An array of "inverse document frequency" weights.
"""
return np.log(1 + num_documents / (1 + np.array(document_counts)))