tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py - platform/external/tensorflow - Git at Google

 # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Tensorflow V1 version of the text vectorization preprocessing layer."""

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import numpy as np

 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.engine import base_preprocessing_layer_v1
 from tensorflow.python.keras.layers.preprocessing import text_vectorization
 from tensorflow.python.ops.ragged import ragged_tensor_value


 class TextVectorization(text_vectorization.TextVectorization,
                         base_preprocessing_layer_v1.CombinerPreprocessingLayer):
   """Text vectorization layer.

   This layer has basic options for managing text in a Keras model. It
   transforms a batch of strings (one sample = one string) into either a list of
   token indices (one sample = 1D tensor of integer token indices) or a dense
   representation (one sample = 1D tensor of float values representing data about
   the sample's tokens).

   The processing of each sample contains the following steps:
     1) standardize each sample (usually lowercasing + punctuation stripping)
     2) split each sample into substrings (usually words)
     3) recombine substrings into tokens (usually ngrams)
     4) index tokens (associate a unique int value with each token)
     5) transform each sample using this index, either into a vector of ints or
        a dense float vector.

   Attributes:
     max_tokens: The maximum size of the vocabulary for this layer. If None,
       there is no cap on the size of the vocabulary.
     standardize: Optional specification for standardization to apply to the
       input text. Values can be None (no standardization),
       LOWER_AND_STRIP_PUNCTUATION (lowercase and remove punctuation) or a
       Callable.
     split: Optional specification for splitting the input text. Values can be
       None (no splitting), SPLIT_ON_WHITESPACE (split on ASCII whitespace), or a
       Callable.
     ngrams: Optional specification for ngrams to create from the possibly-split
       input text. Values can be None, an integer or tuple of integers; passing
       an integer will create ngrams up to that integer, and passing a tuple of
       integers will create ngrams for the specified values in the tuple. Passing
       None means that no ngrams will be created.
     output_mode: Optional specification for the output of the layer. Values can
       be INT, BINARY, COUNT or TFIDF, which control the outputs as follows:
         INT: Outputs integer indices, one integer index per split string token.
         BINARY: Outputs a single int array per batch, of either vocab_size or
           max_tokens size, containing 1s in all elements where the token mapped
           to that index exists at least once in the batch item.
         COUNT: As BINARY, but the int array contains a count of the number of
           times the token at that index appeared in the batch item.
         TFIDF: As BINARY, but the TF-IDF algorithm is applied to find the value
           in each token slot.
     output_sequence_length: Optional length for the output tensor. If set,
       the output will be padded or truncated to this value in INT mode.
     pad_to_max_tokens: If True, BINARY, COUNT, and TFIDF modes will have their
       outputs padded to max_tokens, even if the number of unique tokens in the
       vocabulary is less than max_tokens.
   """

   def _get_table_data(self):
     keys, values = self._table.export()
     np_keys = K.get_session().run(keys)
     np_values = K.get_session().run(values)
     return (np_keys, np_values)

   def _get_table_size(self):
     return K.get_session().run(self._table.size())

   def _clear_table(self):
     keys, _ = self._table.export()
     K.get_session().run(self._table.remove(keys))
     self._has_vocab = False

   def _insert_table_data(self, keys, values):
     K.get_session().run(self._table.insert(keys, values))
     self._has_vocab = True

   def _to_numpy(self, data):
     """Converts preprocessed inputs into numpy arrays."""
     if isinstance(data, np.ndarray):
       return data
     session = K.get_session()
     data = session.run(data)
     if isinstance(data, ragged_tensor_value.RaggedTensorValue):
       data = np.array(data.to_list())
     return data
	# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	"""Tensorflow V1 version of the text vectorization preprocessing layer."""

	from __future__ import absolute_import
	from __future__ import division
	from __future__ import print_function

	import numpy as np

	from tensorflow.python.keras import backend as K
	from tensorflow.python.keras.engine import base_preprocessing_layer_v1
	from tensorflow.python.keras.layers.preprocessing import text_vectorization
	from tensorflow.python.ops.ragged import ragged_tensor_value


	class TextVectorization(text_vectorization.TextVectorization,
	base_preprocessing_layer_v1.CombinerPreprocessingLayer):
	"""Text vectorization layer.

	This layer has basic options for managing text in a Keras model. It
	transforms a batch of strings (one sample = one string) into either a list of
	token indices (one sample = 1D tensor of integer token indices) or a dense
	representation (one sample = 1D tensor of float values representing data about
	the sample's tokens).

	The processing of each sample contains the following steps:
	1) standardize each sample (usually lowercasing + punctuation stripping)
	2) split each sample into substrings (usually words)
	3) recombine substrings into tokens (usually ngrams)
	4) index tokens (associate a unique int value with each token)
	5) transform each sample using this index, either into a vector of ints or
	a dense float vector.

	Attributes:
	max_tokens: The maximum size of the vocabulary for this layer. If None,
	there is no cap on the size of the vocabulary.
	standardize: Optional specification for standardization to apply to the
	input text. Values can be None (no standardization),
	LOWER_AND_STRIP_PUNCTUATION (lowercase and remove punctuation) or a
	Callable.
	split: Optional specification for splitting the input text. Values can be
	None (no splitting), SPLIT_ON_WHITESPACE (split on ASCII whitespace), or a
	Callable.
	ngrams: Optional specification for ngrams to create from the possibly-split
	input text. Values can be None, an integer or tuple of integers; passing
	an integer will create ngrams up to that integer, and passing a tuple of
	integers will create ngrams for the specified values in the tuple. Passing
	None means that no ngrams will be created.
	output_mode: Optional specification for the output of the layer. Values can
	be INT, BINARY, COUNT or TFIDF, which control the outputs as follows:
	INT: Outputs integer indices, one integer index per split string token.
	BINARY: Outputs a single int array per batch, of either vocab_size or
	max_tokens size, containing 1s in all elements where the token mapped
	to that index exists at least once in the batch item.
	COUNT: As BINARY, but the int array contains a count of the number of
	times the token at that index appeared in the batch item.
	TFIDF: As BINARY, but the TF-IDF algorithm is applied to find the value
	in each token slot.
	output_sequence_length: Optional length for the output tensor. If set,
	the output will be padded or truncated to this value in INT mode.
	pad_to_max_tokens: If True, BINARY, COUNT, and TFIDF modes will have their
	outputs padded to max_tokens, even if the number of unique tokens in the
	vocabulary is less than max_tokens.
	"""

	def _get_table_data(self):
	keys, values = self._table.export()
	np_keys = K.get_session().run(keys)
	np_values = K.get_session().run(values)
	return (np_keys, np_values)

	def _get_table_size(self):
	return K.get_session().run(self._table.size())

	def _clear_table(self):
	keys, _ = self._table.export()
	K.get_session().run(self._table.remove(keys))
	self._has_vocab = False

	def _insert_table_data(self, keys, values):
	K.get_session().run(self._table.insert(keys, values))
	self._has_vocab = True

	def _to_numpy(self, data):
	"""Converts preprocessed inputs into numpy arrays."""
	if isinstance(data, np.ndarray):
	return data
	session = K.get_session()
	data = session.run(data)
	if isinstance(data, ragged_tensor_value.RaggedTensorValue):
	data = np.array(data.to_list())
	return data