# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Saves out a .wav file with synthesized conversational data and labels.

The best way to estimate the real-world performance of an audio recognition
model is by running it against a continuous stream of data, the way that it
would be used in an application. Training evaluations are only run against
discrete individual samples, so the results aren't as realistic.

To make it easy to run evaluations against audio streams, this script uses
samples from the testing partition of the data set, mixes them in at random
positions together with background noise, and saves out the result as one long
audio file.

Here's an example of generating a test file:

bazel run tensorflow/examples/speech_commands:generate_streaming_test_wav -- \
--data_dir=/tmp/my_wavs --background_dir=/tmp/my_backgrounds \
--background_volume=0.1 --test_duration_seconds=600 \
--output_audio_file=/tmp/streaming_test.wav \
--output_labels_file=/tmp/streaming_test_labels.txt

Once you've created a streaming audio file, you can then use the
test_streaming_accuracy tool to calculate accuracy metrics for a model.
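
For example, an invocation along these lines (the flag names here follow the
audio recognition tutorial and may differ between versions, so check that
tool's documentation):

bazel run tensorflow/examples/speech_commands:test_streaming_accuracy -- \
--graph=/tmp/my_frozen_graph.pb \
--labels=/tmp/speech_commands_train/conv_labels.txt \
--wav=/tmp/streaming_test.wav \
--ground_truth=/tmp/streaming_test_labels.txt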
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import math
import sys

import numpy as np
import tensorflow as tf

import input_data
import models

FLAGS = None


def mix_in_audio_sample(track_data, track_offset, sample_data, sample_offset,
                        clip_duration, sample_volume, ramp_in, ramp_out):
  """Mixes the sample data into the main track at the specified offset.

  Args:
    track_data: Numpy array holding the main audio data. Modified in-place.
    track_offset: Sample index in the main track to start mixing at.
    sample_data: Numpy array of audio data to mix into the main track.
    sample_offset: Sample index to start reading from within the sample.
    clip_duration: Length in samples of the segment to mix in.
    sample_volume: Loudness to mix the sample in at, with 1.0 leaving the
      sample unscaled.
    ramp_in: Length in samples of the volume increase stage.
    ramp_out: Length in samples of the volume decrease stage.
  """
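  # The mixed-in sample is shaped by a trapezoidal envelope: volume ramps up
  # linearly over the first ramp_in samples, holds at sample_volume, then
  # ramps back down over the last ramp_out samples of the clip.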
  ramp_out_index = clip_duration - ramp_out
  track_end = min(track_offset + clip_duration, track_data.shape[0])
  track_end = min(track_end,
                  track_offset + (sample_data.shape[0] - sample_offset))
  sample_range = track_end - track_offset
  for i in range(sample_range):
    if i < ramp_in:
      envelope_scale = i / ramp_in
    elif i > ramp_out_index:
      envelope_scale = (clip_duration - i) / ramp_out
    else:
      envelope_scale = 1
    sample_input = sample_data[sample_offset + i]
    track_data[track_offset + i] += (
        sample_input * envelope_scale * sample_volume)


def main(_):
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.feature_bin_count,
      'mfcc')
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings, FLAGS.data_dir)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

  # Set up background audio.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
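  # Each segment is background_crossover_ms longer than the stride between
  # segments, so neighboring segments overlap and their volume ramps
  # cross-fade into each other.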
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
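  # Passing -1 asks get_unprocessed_data() for every sample in the testing
  # partition.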
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
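    # Each word lands at its stride position plus a random jitter of up to
    # word_gap_ms, so the spacing between words varies.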
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
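      # Index 2 onward skips the silence and unknown entries that
      # input_data.prepare_words_list() prepends to the wanted words.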
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    test_data_start = np.random.randint(len(all_test_data))
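    # Walk through the test set in shuffled order until a sample with the
    # wanted label turns up.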
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    if found_sample_data is None:
      # No testing sample has the wanted label, so skip this word slot.
      continue
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.compat.v1.logging.info('Saved streaming test wav to %s',
                            FLAGS.output_audio_file)

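  # Each line written below is "<label>, <time in ms>", which is the ground
  # truth format the streaming accuracy tooling reads back.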
  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.compat.v1.logging.info('Saved streaming test labels to %s',
                            FLAGS.output_labels_file)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--data_url',
      type=str,
      # pylint: disable=line-too-long
      default='http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
      # pylint: enable=line-too-long
      help='Location of speech training data archive on the web.')
  parser.add_argument(
      '--data_dir',
      type=str,
      default='/tmp/speech_dataset',
      help="""\
      Where to download the speech training data to.
      """)
  parser.add_argument(
      '--background_dir',
      type=str,
      default='',
      help="""\
      Path to a directory of .wav files to mix in as background noise.
      """)
  parser.add_argument(
      '--background_volume',
      type=float,
      default=0.1,
      help="""\
      How loud the background noise should be, between 0 and 1.
      """)
  parser.add_argument(
      '--background_frequency',
      type=float,
      default=0.8,
      help="""\
      What proportion of the samples should have background noise mixed in.
      """)
  parser.add_argument(
      '--silence_percentage',
      type=float,
      default=10.0,
      help="""\
      How much of the training data should be silence.
      """)
  parser.add_argument(
      '--testing_percentage',
      type=int,
      default=10,
      help='What percentage of wavs to use as a test set.')
  parser.add_argument(
      '--validation_percentage',
      type=int,
      default=10,
      help='What percentage of wavs to use as a validation set.')
  parser.add_argument(
      '--sample_rate',
      type=int,
      default=16000,
      help='Expected sample rate of the wavs.',)
  parser.add_argument(
      '--clip_duration_ms',
      type=int,
      default=1000,
      help='Expected duration in milliseconds of the wavs.',)
  parser.add_argument(
      '--window_size_ms',
      type=float,
      default=30.0,
      help='How long each spectrogram timeslice is.',)
  parser.add_argument(
      '--window_stride_ms',
      type=float,
      default=10.0,
      help='How long the stride is between spectrogram timeslices.',)
  parser.add_argument(
      '--feature_bin_count',
      type=int,
      default=40,
      help='How many bins to use for the MFCC fingerprint.',
  )
  parser.add_argument(
      '--wanted_words',
      type=str,
      default='yes,no,up,down,left,right,on,off,stop,go',
      help='Words to use (others will be added to an unknown label).',)
  parser.add_argument(
      '--output_audio_file',
      type=str,
      default='/tmp/speech_commands_train/streaming_test.wav',
      help='File to save the generated test audio to.')
  parser.add_argument(
      '--output_labels_file',
      type=str,
      default='/tmp/speech_commands_train/streaming_test_labels.txt',
      help='File to save the generated test labels to.')
  parser.add_argument(
      '--test_duration_seconds',
      type=int,
      default=600,
      help='How long the generated test audio file should be, in seconds.',)
  parser.add_argument(
      '--word_gap_ms',
      type=int,
      default=2000,
      help='How long the average gap between words should be, in milliseconds.',)
  parser.add_argument(
      '--unknown_percentage',
      type=int,
      default=30,
      help='What percentage of words should be unknown.')

  FLAGS, unparsed = parser.parse_known_args()
  tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)