tensorflow/python/keras/preprocessing/text.py - platform/external/tensorflow - Git at Google

 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Utilities for text input preprocessing."""
 # pylint: disable=invalid-name

 from keras_preprocessing import text

 from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory  # pylint: disable=unused-import
 from tensorflow.python.util.tf_export import keras_export

 # Re-export the `Tokenizer` and `hashing_trick` symbols of
 # `keras_preprocessing.text` under this module's namespace, unchanged.
 Tokenizer = text.Tokenizer
 hashing_trick = text.hashing_trick


 @keras_export('keras.preprocessing.text.text_to_word_sequence')
 def text_to_word_sequence(input_text,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                           lower=True,
                           split=' '):
   """Splits a text into a list of words (or tokens).

   The characters listed in `filters` (punctuation by default) are ignored
   while the input string is turned into a list of words.

   >>> sample_text = 'This is a sample sentence.'
   >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
   ['this', 'is', 'a', 'sample', 'sentence']

   Args:
       input_text: Input text (string).
       filters: list (or concatenation) of characters to filter out, such as
           punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
           which covers basic punctuation, tabs, and newlines.
       lower: boolean. Whether to convert the input to lowercase.
       split: str. Separator for word splitting.

   Returns:
       A list of words (or tokens).
   """
   # Thin wrapper: every option is forwarded by keyword to the
   # keras_preprocessing implementation.
   forwarded = dict(filters=filters, lower=lower, split=split)
   return text.text_to_word_sequence(input_text, **forwarded)


 @keras_export('keras.preprocessing.text.one_hot')
 def one_hot(input_text,
             n,
             filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
             lower=True,
             split=' '):
   r"""Encodes a text as a list of word indexes (vocabulary size `n`).

   Takes a string of text and returns a list of integers, one per word
   (or token) of the input string.

   Args:
       input_text: Input text (string).
       n: int. Size of vocabulary.
       filters: list (or concatenation) of characters to filter out, such as
         punctuation. Default:
         ```
         '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'
         ```,
         which covers basic punctuation, tabs, and newlines.
       lower: boolean. Whether to set the text to lowercase.
       split: str. Separator for word splitting.

   Returns:
       List of integers in `[1, n]`. Each integer encodes a word
       (unicity non-guaranteed).
   """
   # Thin wrapper: delegate to keras_preprocessing, options passed by keyword.
   encoded = text.one_hot(
       input_text, n, filters=filters, lower=lower, split=split)
   return encoded


 # text.tokenizer_from_json is only available if keras_preprocessing >= 1.1.0.
 # Only the attribute lookup is guarded: the export call lives in the `else`
 # branch so that an AttributeError raised while exporting is not silently
 # swallowed together with the "symbol is missing" case.
 try:
   tokenizer_from_json = text.tokenizer_from_json
 except AttributeError:
   pass
 else:
   keras_export('keras.preprocessing.text.tokenizer_from_json')(
       tokenizer_from_json)

 # Apply the keras_export decorators to the symbols re-exported from
 # keras_preprocessing.
 keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
 keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
	# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	"""Utilities for text input preprocessing."""
	# pylint: disable=invalid-name

	from keras_preprocessing import text

	from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory # pylint: disable=unused-import
	from tensorflow.python.util.tf_export import keras_export

	# Re-export the `Tokenizer` and `hashing_trick` symbols of
	# `keras_preprocessing.text` under this module's namespace, unchanged.
	Tokenizer = text.Tokenizer
	hashing_trick = text.hashing_trick


	@keras_export('keras.preprocessing.text.text_to_word_sequence')
	def text_to_word_sequence(input_text,
	                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
	                          lower=True,
	                          split=' '):
	  """Converts a text to a sequence of words (or tokens).

	  This function transforms a string of text into a list of words
	  while ignoring `filters` which include punctuations by default.

	  >>> sample_text = 'This is a sample sentence.'
	  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
	  ['this', 'is', 'a', 'sample', 'sentence']

	  Args:
	      input_text: Input text (string).
	      filters: list (or concatenation) of characters to filter out, such as
	          punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
	          includes basic punctuation, tabs, and newlines.
	      lower: boolean. Whether to convert the input to lowercase.
	      split: str. Separator for word splitting.

	  Returns:
	      A list of words (or tokens).
	  """
	  # Thin wrapper: all options are forwarded to keras_preprocessing.
	  return text.text_to_word_sequence(
	      input_text, filters=filters, lower=lower, split=split)


	@keras_export('keras.preprocessing.text.one_hot')
	def one_hot(input_text,
	            n,
	            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
	            lower=True,
	            split=' '):
	  r"""One-hot encodes a text into a list of word indexes of size `n`.

	  This function receives as input a string of text and returns a
	  list of encoded integers each corresponding to a word (or token)
	  in the given input string.

	  Args:
	      input_text: Input text (string).
	      n: int. Size of vocabulary.
	      filters: list (or concatenation) of characters to filter out, such as
	        punctuation. Default:
	        ```
	        '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'
	        ```,
	        includes basic punctuation, tabs, and newlines.
	      lower: boolean. Whether to set the text to lowercase.
	      split: str. Separator for word splitting.

	  Returns:
	      List of integers in `[1, n]`. Each integer encodes a word
	      (unicity non-guaranteed).
	  """
	  # Thin wrapper: all options are forwarded to keras_preprocessing.
	  return text.one_hot(input_text, n, filters=filters, lower=lower, split=split)


	# text.tokenizer_from_json is only available if keras_preprocessing >= 1.1.0.
	# Only the attribute lookup is guarded: the export call lives in the `else`
	# branch so that an AttributeError raised while exporting is not silently
	# swallowed together with the "symbol is missing" case.
	try:
	  tokenizer_from_json = text.tokenizer_from_json
	except AttributeError:
	  pass
	else:
	  keras_export('keras.preprocessing.text.tokenizer_from_json')(
	      tokenizer_from_json)

	# Apply the keras_export decorators to the symbols re-exported from
	# keras_preprocessing.
	keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
	keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)