tensorflow/python/keras/preprocessing/sequence.py - platform/external/tensorflow - Git at Google

 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Utilities for preprocessing sequence data."""
 # pylint: disable=invalid-name

 from keras_preprocessing import sequence

 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.util.tf_export import keras_export

 make_sampling_table = sequence.make_sampling_table
 skipgrams = sequence.skipgrams
 # TODO(fchollet): consider making `_remove_long_seq` public.
 _remove_long_seq = sequence._remove_long_seq  # pylint: disable=protected-access


 @keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
 class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence):
   """Utility class for generating batches of temporal data.

   This class takes in a sequence of data-points gathered at
   equal intervals, along with time series parameters such as
   stride, length of history, etc., to produce batches for
   training/validation.
   # Arguments
       data: Indexable generator (such as list or Numpy array)
           containing consecutive data points (timesteps).
           The data should be at 2D, and axis 0 is expected
           to be the time dimension.
       targets: Targets corresponding to timesteps in `data`.
           It should have same length as `data`.
       length: Length of the output sequences (in number of timesteps).
       sampling_rate: Period between successive individual timesteps
           within sequences. For rate `r`, timesteps
           `data[i]`, `data[i-r]`, ... `data[i - length]`
           are used for create a sample sequence.
       stride: Period between successive output sequences.
           For stride `s`, consecutive output samples would
           be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
       start_index: Data points earlier than `start_index` will not be used
           in the output sequences. This is useful to reserve part of the
           data for test or validation.
       end_index: Data points later than `end_index` will not be used
           in the output sequences. This is useful to reserve part of the
           data for test or validation.
       shuffle: Whether to shuffle output samples,
           or instead draw them in chronological order.
       reverse: Boolean: if `true`, timesteps in each output sample will be
           in reverse chronological order.
       batch_size: Number of timeseries samples in each batch
           (except maybe the last one).
   # Returns
       A [Sequence](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) instance.
   # Examples
   ```python
   from keras.preprocessing.sequence import TimeseriesGenerator
   import numpy as np
   data = np.array([[i] for i in range(50)])
   targets = np.array([[i] for i in range(50)])
   data_gen = TimeseriesGenerator(data, targets,
                                  length=10, sampling_rate=2,
                                  batch_size=2)
   assert len(data_gen) == 20
   batch_0 = data_gen[0]
   x, y = batch_0
   assert np.array_equal(x,
                         np.array([[[0], [2], [4], [6], [8]],
                                   [[1], [3], [5], [7], [9]]]))
   assert np.array_equal(y,
                         np.array([[10], [11]]))
   ```
   """
   pass


 @keras_export('keras.preprocessing.sequence.pad_sequences')
 def pad_sequences(sequences, maxlen=None, dtype='int32',
                   padding='pre', truncating='pre', value=0.):
   """Pads sequences to the same length.

   This function transforms a list (of length `num_samples`)
   of sequences (lists of integers)
   into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
   `num_timesteps` is either the `maxlen` argument if provided,
   or the length of the longest sequence in the list.

   Sequences that are shorter than `num_timesteps`
   are padded with `value` until they are `num_timesteps` long.

   Sequences longer than `num_timesteps` are truncated
   so that they fit the desired length.

   The position where padding or truncation happens is determined by
   the arguments `padding` and `truncating`, respectively.
   Pre-padding or removing values from the beginning of the sequence is the
   default.

   >>> sequence = [[1], [2, 3], [4, 5, 6]]
   >>> tf.keras.preprocessing.sequence.pad_sequences(sequence)
   array([[0, 0, 1],
          [0, 2, 3],
          [4, 5, 6]], dtype=int32)

   >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1)
   array([[-1, -1,  1],
          [-1,  2,  3],
          [ 4,  5,  6]], dtype=int32)

   >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post')
   array([[1, 0, 0],
          [2, 3, 0],
          [4, 5, 6]], dtype=int32)

   >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2)
   array([[0, 1],
          [2, 3],
          [5, 6]], dtype=int32)

   Args:
       sequences: List of sequences (each sequence is a list of integers).
       maxlen: Optional Int, maximum length of all sequences. If not provided,
           sequences will be padded to the length of the longest individual
           sequence.
       dtype: (Optional, defaults to int32). Type of the output sequences.
           To pad sequences with variable length strings, you can use `object`.
       padding: String, 'pre' or 'post' (optional, defaults to 'pre'):
           pad either before or after each sequence.
       truncating: String, 'pre' or 'post' (optional, defaults to 'pre'):
           remove values from sequences larger than
           `maxlen`, either at the beginning or at the end of the sequences.
       value: Float or String, padding value. (Optional, defaults to 0.)

   Returns:
       Numpy array with shape `(len(sequences), maxlen)`

   Raises:
       ValueError: In case of invalid values for `truncating` or `padding`,
           or in case of invalid shape for a `sequences` entry.
   """
   return sequence.pad_sequences(
       sequences, maxlen=maxlen, dtype=dtype,
       padding=padding, truncating=truncating, value=value)

 keras_export(
     'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
 keras_export('keras.preprocessing.sequence.skipgrams')(skipgrams)
	# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	"""Utilities for preprocessing sequence data."""
	# pylint: disable=invalid-name

	from keras_preprocessing import sequence

	from tensorflow.python.keras.utils import data_utils
	from tensorflow.python.util.tf_export import keras_export

	make_sampling_table = sequence.make_sampling_table
	skipgrams = sequence.skipgrams
	# TODO(fchollet): consider making `_remove_long_seq` public.
	_remove_long_seq = sequence._remove_long_seq # pylint: disable=protected-access


	@keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
	class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence):
	"""Utility class for generating batches of temporal data.

	This class takes in a sequence of data-points gathered at
	equal intervals, along with time series parameters such as
	stride, length of history, etc., to produce batches for
	training/validation.
	# Arguments
	data: Indexable generator (such as list or Numpy array)
	containing consecutive data points (timesteps).
	The data should be at 2D, and axis 0 is expected
	to be the time dimension.
	targets: Targets corresponding to timesteps in `data`.
	It should have same length as `data`.
	length: Length of the output sequences (in number of timesteps).
	sampling_rate: Period between successive individual timesteps
	within sequences. For rate `r`, timesteps
	`data[i]`, `data[i-r]`, ... `data[i - length]`
	are used for create a sample sequence.
	stride: Period between successive output sequences.
	For stride `s`, consecutive output samples would
	be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
	start_index: Data points earlier than `start_index` will not be used
	in the output sequences. This is useful to reserve part of the
	data for test or validation.
	end_index: Data points later than `end_index` will not be used
	in the output sequences. This is useful to reserve part of the
	data for test or validation.
	shuffle: Whether to shuffle output samples,
	or instead draw them in chronological order.
	reverse: Boolean: if `true`, timesteps in each output sample will be
	in reverse chronological order.
	batch_size: Number of timeseries samples in each batch
	(except maybe the last one).
	# Returns
	A [Sequence](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) instance.
	# Examples
	```python
	from keras.preprocessing.sequence import TimeseriesGenerator
	import numpy as np
	data = np.array([[i] for i in range(50)])
	targets = np.array([[i] for i in range(50)])
	data_gen = TimeseriesGenerator(data, targets,
	length=10, sampling_rate=2,
	batch_size=2)
	assert len(data_gen) == 20
	batch_0 = data_gen[0]
	x, y = batch_0
	assert np.array_equal(x,
	np.array([[[0], [2], [4], [6], [8]],
	[[1], [3], [5], [7], [9]]]))
	assert np.array_equal(y,
	np.array([[10], [11]]))
	```
	"""
	pass


	@keras_export('keras.preprocessing.sequence.pad_sequences')
	def pad_sequences(sequences, maxlen=None, dtype='int32',
	padding='pre', truncating='pre', value=0.):
	"""Pads sequences to the same length.

	This function transforms a list (of length `num_samples`)
	of sequences (lists of integers)
	into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
	`num_timesteps` is either the `maxlen` argument if provided,
	or the length of the longest sequence in the list.

	Sequences that are shorter than `num_timesteps`
	are padded with `value` until they are `num_timesteps` long.

	Sequences longer than `num_timesteps` are truncated
	so that they fit the desired length.

	The position where padding or truncation happens is determined by
	the arguments `padding` and `truncating`, respectively.
	Pre-padding or removing values from the beginning of the sequence is the
	default.

	>>> sequence = [[1], [2, 3], [4, 5, 6]]
	>>> tf.keras.preprocessing.sequence.pad_sequences(sequence)
	array([[0, 0, 1],
	[0, 2, 3],
	[4, 5, 6]], dtype=int32)

	>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1)
	array([[-1, -1, 1],
	[-1, 2, 3],
	[ 4, 5, 6]], dtype=int32)

	>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post')
	array([[1, 0, 0],
	[2, 3, 0],
	[4, 5, 6]], dtype=int32)

	>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2)
	array([[0, 1],
	[2, 3],
	[5, 6]], dtype=int32)

	Args:
	sequences: List of sequences (each sequence is a list of integers).
	maxlen: Optional Int, maximum length of all sequences. If not provided,
	sequences will be padded to the length of the longest individual
	sequence.
	dtype: (Optional, defaults to int32). Type of the output sequences.
	To pad sequences with variable length strings, you can use `object`.
	padding: String, 'pre' or 'post' (optional, defaults to 'pre'):
	pad either before or after each sequence.
	truncating: String, 'pre' or 'post' (optional, defaults to 'pre'):
	remove values from sequences larger than
	`maxlen`, either at the beginning or at the end of the sequences.
	value: Float or String, padding value. (Optional, defaults to 0.)

	Returns:
	Numpy array with shape `(len(sequences), maxlen)`

	Raises:
	ValueError: In case of invalid values for `truncating` or `padding`,
	or in case of invalid shape for a `sequences` entry.
	"""
	return sequence.pad_sequences(
	sequences, maxlen=maxlen, dtype=dtype,
	padding=padding, truncating=truncating, value=value)

	keras_export(
	'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
	keras_export('keras.preprocessing.sequence.skipgrams')(skipgrams)