# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Saves out a .wav file with synthesized conversational data and labels.
The best way to estimate the real-world performance of an audio recognition
model is by running it against a continuous stream of data, the way that it
would be used in an application. Training evaluations are only run against
discrete individual samples, so the results aren't as realistic.
To make it easy to run evaluations against audio streams, this script uses
samples from the testing partition of the data set, mixes them in at random
positions together with background noise, and saves out the result as one long
audio file.
Here's an example of generating a test file:
bazel run tensorflow/examples/speech_commands:generate_streaming_test_wav -- \
--data_dir=/tmp/my_wavs --background_dir=/tmp/my_backgrounds \
--background_volume=0.1 --test_duration_seconds=600 \
--output_audio_file=/tmp/streaming_test.wav \
--output_labels_file=/tmp/streaming_test_labels.txt
Once you've created a streaming audio file, you can then use the
test_streaming_accuracy tool to calculate accuracy metrics for a model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import math
import sys

import numpy as np
import tensorflow as tf

import input_data
import models

FLAGS = None


def mix_in_audio_sample(track_data, track_offset, sample_data, sample_offset,
                        clip_duration, sample_volume, ramp_in, ramp_out):
  """Mixes the sample data into the main track at the specified offset.

  Args:
    track_data: Numpy array holding main audio data. Modified in-place.
    track_offset: Where to mix the sample into the main track.
    sample_data: Numpy array of audio data to mix into the main track.
    sample_offset: Where to start in the audio sample.
    clip_duration: How long the sample segment is.
    sample_volume: Loudness to mix the sample in at.
    ramp_in: Length in samples of volume increase stage.
    ramp_out: Length in samples of volume decrease stage.
  """
  ramp_out_index = clip_duration - ramp_out
  track_end = min(track_offset + clip_duration, track_data.shape[0])
  track_end = min(track_end,
                  track_offset + (sample_data.shape[0] - sample_offset))
  sample_range = track_end - track_offset
  for i in range(sample_range):
    if i < ramp_in:
      envelope_scale = i / ramp_in
    elif i > ramp_out_index:
      envelope_scale = (clip_duration - i) / ramp_out
    else:
      envelope_scale = 1
    sample_input = sample_data[sample_offset + i]
    track_data[track_offset
               + i] += sample_input * envelope_scale * sample_volume


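# main() builds one long test track: it first tiles overlapping background
# noise segments across the whole file, then mixes in word clips drawn from
# the testing partition, and finally writes the audio plus a label/timestamp
# list to disk.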
def main(_):
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.feature_bin_count,
      'mfcc')
  audio_processor = input_data.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings, FLAGS.data_dir)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)
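  # The output track holds test_duration_seconds worth of samples at
  # sample_rate, initially all zeros; background noise and word clips are
  # accumulated into it in place by mix_in_audio_sample() below.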

  # Set up background audio.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)
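  # Each background segment is clip_duration_ms plus the 500ms crossover long,
  # but the stride between segments is only clip_duration_ms, so neighbouring
  # segments overlap by the crossover and fade in/out over 250ms ramps.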

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
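  # Word clips are placed on a grid of clip_duration_ms + word_gap_ms slots,
  # with each clip jittered by a random offset inside the gap. Roughly
  # unknown_percentage of the slots get an unknown-word clip; the rest use one
  # of the wanted words (indices 2+ of words_list skip the silence and unknown
  # entries).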
  for i in range(how_many_words):
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
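    # Scan the shuffled test set from a random starting point until a clip
    # with the wanted label is found. (found_sample_data stays None if that
    # label never occurs in the testing partition.)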
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.compat.v1.logging.info('Saved streaming test wav to %s',
                            FLAGS.output_audio_file)
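
  # Each line of the labels file is "<label>, <time in milliseconds>", which
  # is the ground truth format the test_streaming_accuracy tool expects.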
  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.compat.v1.logging.info('Saved streaming test labels to %s',
                            FLAGS.output_labels_file)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--data_url',
      type=str,
      # pylint: disable=line-too-long
      default='http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
      # pylint: enable=line-too-long
      help='Location of speech training data')
  parser.add_argument(
      '--data_dir',
      type=str,
      default='/tmp/speech_dataset',
      help="""\
      Where to download the speech training data to.
      """)
  parser.add_argument(
      '--background_dir',
      type=str,
      default='',
      help="""\
      Path to a directory of .wav files to mix in as background noise during training.
      """)
  parser.add_argument(
      '--background_volume',
      type=float,
      default=0.1,
      help="""\
      How loud the background noise should be, between 0 and 1.
      """)
  parser.add_argument(
      '--background_frequency',
      type=float,
      default=0.8,
      help="""\
      How many of the training samples have background noise mixed in.
      """)
  parser.add_argument(
      '--silence_percentage',
      type=float,
      default=10.0,
      help="""\
      How much of the training data should be silence.
      """)
  parser.add_argument(
      '--testing_percentage',
      type=int,
      default=10,
      help='What percentage of wavs to use as a test set.')
  parser.add_argument(
      '--validation_percentage',
      type=int,
      default=10,
      help='What percentage of wavs to use as a validation set.')
  parser.add_argument(
      '--sample_rate',
      type=int,
      default=16000,
      help='Expected sample rate of the wavs.',)
  parser.add_argument(
      '--clip_duration_ms',
      type=int,
      default=1000,
      help='Expected duration in milliseconds of the wavs.',)
  parser.add_argument(
      '--window_size_ms',
      type=float,
      default=30.0,
      help='How long each spectrogram timeslice is',)
  parser.add_argument(
      '--window_stride_ms',
      type=float,
      default=10.0,
      help='How long the stride is between spectrogram timeslices',)
  parser.add_argument(
      '--feature_bin_count',
      type=int,
      default=40,
      help='How many bins to use for the MFCC fingerprint',
  )
  parser.add_argument(
      '--wanted_words',
      type=str,
      default='yes,no,up,down,left,right,on,off,stop,go',
      help='Words to use (others will be added to an unknown label)',)
  parser.add_argument(
      '--output_audio_file',
      type=str,
      default='/tmp/speech_commands_train/streaming_test.wav',
      help='File to save the generated test audio to.')
  parser.add_argument(
      '--output_labels_file',
      type=str,
      default='/tmp/speech_commands_train/streaming_test_labels.txt',
      help='File to save the generated test labels to.')
  parser.add_argument(
      '--test_duration_seconds',
      type=int,
      default=600,
      help='How long the generated test audio file should be.',)
  parser.add_argument(
      '--word_gap_ms',
      type=int,
      default=2000,
      help='How long the average gap should be between words.',)
  parser.add_argument(
      '--unknown_percentage',
      type=int,
      default=30,
      help='What percentage of words should be unknown.')
  FLAGS, unparsed = parser.parse_known_args()
  tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)