# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Converts WAV audio files into input features for neural networks.
The models used in this example take in two-dimensional spectrograms as the
input to their neural network portions. For testing and porting purposes it's
useful to be able to generate these spectrograms outside of the full model, so
that on-device implementations using their own FFT and streaming code can be
tested against the version used in training for example. The output is as a
C source file, so it can be easily linked into an embedded test application.
To use this, run:
bazel run tensorflow/examples/speech_commands:wav_to_features -- \
--input_wav=my.wav --output_c_file=my_wav_data.c
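
The generated file defines three globals named after the lowercased WAV
basename (for the command above: g_my_width, g_my_height and g_my_data),
along these lines; the sizes and data depend on the flags and the clip:

const int g_my_width = ...;   /* feature bins per timeslice */
const int g_my_height = ...;  /* number of spectrogram timeslices */
const unsigned char g_my_data[] = { ... };  /* const float without --quantize */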
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os.path
import sys

import tensorflow as tf

import input_data
import models
from tensorflow.python.platform import gfile

FLAGS = None


def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
window_stride_ms, feature_bin_count, quantize, preprocess,
input_wav, output_c_file):
"""Converts an audio file into its corresponding feature map.
Args:
sample_rate: Expected sample rate of the wavs.
clip_duration_ms: Expected duration in milliseconds of the wavs.
window_size_ms: How long each spectrogram timeslice is.
window_stride_ms: How far to move in time between spectogram timeslices.
feature_bin_count: How many bins to use for the feature fingerprint.
    quantize: Whether to quantize the features to eight-bit values.
preprocess: Spectrogram processing mode; "mfcc", "average" or "micro".
input_wav: Path to the audio WAV file to read.
output_c_file: Where to save the generated C source file.
"""
# Start a new TensorFlow session.
sess = tf.compat.v1.InteractiveSession()
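  # Derive the spectrogram geometry (window sizes in samples, timeslice count,
  # feature bins) from the millisecond-based flags, as the training scripts do.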
model_settings = models.prepare_model_settings(
0, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
feature_bin_count, preprocess)
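  # Create an AudioProcessor with no dataset (empty paths and zero splits);
  # only its WAV-loading and feature-extraction graph is needed here.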
audio_processor = input_data.AudioProcessor(None, None, 0, 0, '', 0, 0,
model_settings, None)
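  # Run the WAV through the feature graph; the first element of the returned
  # list is the feature array.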
results = audio_processor.get_features_for_wav(input_wav, model_settings,
sess)
features = results[0]
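  # Name the C globals after the lowercased WAV basename, minus its extension.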
variable_base = os.path.splitext(os.path.basename(input_wav).lower())[0]
# Save a C source file containing the feature data as an array.
with gfile.GFile(output_c_file, 'w') as f:
f.write('/* File automatically created by\n')
f.write(' * tensorflow/examples/speech_commands/wav_to_features.py \\\n')
f.write(' * --sample_rate=%d \\\n' % sample_rate)
f.write(' * --clip_duration_ms=%d \\\n' % clip_duration_ms)
f.write(' * --window_size_ms=%d \\\n' % window_size_ms)
f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
if quantize:
f.write(' * --quantize=1 \\\n')
f.write(' * --preprocess="%s" \\\n' % preprocess)
f.write(' * --input_wav="%s" \\\n' % input_wav)
f.write(' * --output_c_file="%s" \\\n' % output_c_file)
f.write(' */\n\n')
f.write('const int g_%s_width = %d;\n' %
(variable_base, model_settings['fingerprint_width']))
f.write('const int g_%s_height = %d;\n' %
(variable_base, model_settings['spectrogram_length']))
if quantize:
features_min, features_max = input_data.get_features_range(model_settings)
f.write('const unsigned char g_%s_data[] = {' % variable_base)
i = 0
for value in features.flatten():
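        # Linearly map [features_min, features_max] onto 0-255 and clamp any
        # values that fall outside that byte range.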
quantized_value = int(
round(
(255 * (value - features_min)) / (features_max - features_min)))
if quantized_value < 0:
quantized_value = 0
if quantized_value > 255:
quantized_value = 255
if i == 0:
f.write('\n ')
f.write('%d, ' % (quantized_value))
i = (i + 1) % 10
else:
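      # Without quantization, write the raw feature values as floats instead.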
f.write('const float g_%s_data[] = {\n' % variable_base)
i = 0
for value in features.flatten():
if i == 0:
f.write('\n ')
        f.write('%f, ' % value)
i = (i + 1) % 10
  f.write('\n};\n')


def main(_):
# We want to see all the logging messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
wav_to_features(FLAGS.sample_rate, FLAGS.clip_duration_ms,
FLAGS.window_size_ms, FLAGS.window_stride_ms,
FLAGS.feature_bin_count, FLAGS.quantize, FLAGS.preprocess,
FLAGS.input_wav, FLAGS.output_c_file)
  tf.compat.v1.logging.info('Wrote to "%s"' % (FLAGS.output_c_file))


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--sample_rate',
type=int,
default=16000,
help='Expected sample rate of the wavs',)
parser.add_argument(
'--clip_duration_ms',
type=int,
default=1000,
help='Expected duration in milliseconds of the wavs',)
parser.add_argument(
'--window_size_ms',
type=float,
default=30.0,
help='How long each spectrogram timeslice is.',)
parser.add_argument(
'--window_stride_ms',
type=float,
default=10.0,
      help='How far to move in time between spectrogram timeslices.',)
parser.add_argument(
'--feature_bin_count',
type=int,
default=40,
help='How many bins to use for the MFCC fingerprint',
)
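  # Note: argparse's type=bool treats any non-empty string as True, so
  # --quantize=False still enables quantization; omit the flag (or pass an
  # empty value) to keep it disabled.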
parser.add_argument(
'--quantize',
type=bool,
default=False,
      help='Whether to quantize the features to eight-bit values')
parser.add_argument(
'--preprocess',
type=str,
default='mfcc',
help='Spectrogram processing mode. Can be "mfcc", "average", or "micro"')
parser.add_argument(
'--input_wav',
type=str,
default=None,
help='Path to the audio WAV file to read')
parser.add_argument(
'--output_c_file',
type=str,
default=None,
help='Where to save the generated C source file containing the features')
FLAGS, unparsed = parser.parse_known_args()
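  # Parsed flags are stored in the module-level FLAGS; anything unrecognized
  # is passed through to tf.compat.v1.app.run alongside the program name.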
tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)