Diffstat (limited to 'tensorflow/examples/speech_commands/freeze.py')
 tensorflow/examples/speech_commands/freeze.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 18 deletions(-)
diff --git a/tensorflow/examples/speech_commands/freeze.py b/tensorflow/examples/speech_commands/freeze.py
index c8671d9c41..89e790d4e4 100644
--- a/tensorflow/examples/speech_commands/freeze.py
+++ b/tensorflow/examples/speech_commands/freeze.py
@@ -54,7 +54,7 @@ FLAGS = None
 def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                            clip_stride_ms, window_size_ms, window_stride_ms,
-                           dct_coefficient_count, model_architecture):
+                           feature_bin_count, model_architecture, preprocess):
   """Creates an audio model with the nodes needed for inference.
 
   Uses the supplied arguments to create a model, and inserts the input and
   output nodes that are needed to use the graph for inference.
@@ -67,14 +67,19 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
     clip_stride_ms: How often to run recognition. Useful for models with cache.
     window_size_ms: Time slice duration to estimate frequencies from.
     window_stride_ms: How far apart time slices should be.
-    dct_coefficient_count: Number of frequency bands to analyze.
+    feature_bin_count: Number of frequency bands to analyze.
     model_architecture: Name of the kind of model to generate.
+    preprocess: How the spectrogram is processed to produce features, for
+      example 'mfcc' or 'average'.
+
+  Raises:
+    Exception: If the preprocessing mode isn't recognized.
   """
   words_list = input_data.prepare_words_list(wanted_words.split(','))
   model_settings = models.prepare_model_settings(
       len(words_list), sample_rate, clip_duration_ms, window_size_ms,
-      window_stride_ms, dct_coefficient_count)
+      window_stride_ms, feature_bin_count, preprocess)
   runtime_settings = {'clip_stride_ms': clip_stride_ms}
 
   wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
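The settings dict consulted below now carries generalized feature-geometry keys ('fingerprint_width', 'fingerprint_size', 'average_window_width') in place of the old 'dct_coefficient_count'. A rough sketch of the fields this file reads, with illustrative values for 16 kHz audio and the default flags (the numbers are assumptions for orientation, not output copied from models.prepare_model_settings):

    model_settings = {
        'window_size_samples': 480,    # 30 ms at 16 kHz
        'window_stride_samples': 160,  # 10 ms at 16 kHz
        'spectrogram_length': 98,      # time slices in a 1000 ms clip
        'fingerprint_width': 40,       # features per slice (= feature_bin_count for 'mfcc')
        'average_window_width': 6,     # frequency pooling width, used only by 'average'
        'fingerprint_size': 3920,      # fingerprint_width * spectrogram_length
    }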
@@ -88,15 +93,25 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
       window_size=model_settings['window_size_samples'],
       stride=model_settings['window_stride_samples'],
       magnitude_squared=True)
-  fingerprint_input = contrib_audio.mfcc(
-      spectrogram,
-      decoded_sample_data.sample_rate,
-      dct_coefficient_count=dct_coefficient_count)
-  fingerprint_frequency_size = model_settings['dct_coefficient_count']
-  fingerprint_time_size = model_settings['spectrogram_length']
-  reshaped_input = tf.reshape(fingerprint_input, [
-      -1, fingerprint_time_size * fingerprint_frequency_size
-  ])
+
+  if preprocess == 'average':
+    fingerprint_input = tf.nn.pool(
+        tf.expand_dims(spectrogram, -1),
+        window_shape=[1, model_settings['average_window_width']],
+        strides=[1, model_settings['average_window_width']],
+        pooling_type='AVG',
+        padding='SAME')
+  elif preprocess == 'mfcc':
+    fingerprint_input = contrib_audio.mfcc(
+        spectrogram,
+        sample_rate,
+        dct_coefficient_count=model_settings['fingerprint_width'])
+  else:
+    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
+                    ' "average")' % (preprocess))
+
+  fingerprint_size = model_settings['fingerprint_size']
+  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])
 
   logits = models.create_model(
       reshaped_input, model_settings, model_architecture, is_training=False,
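This hunk keeps the existing MFCC path (now fed the sample_rate argument rather than decoded_sample_data.sample_rate) and adds a cheaper 'average' mode that simply mean-pools the spectrogram's frequency axis. A minimal, self-contained sketch of that pooling step (TF 1.x; the sizes 98, 257, and 6 are made-up example values standing in for what model_settings would supply):

    import tensorflow as tf

    spectrogram = tf.random_uniform([1, 98, 257])  # [batch, time, fft_bins]
    window_width = 6  # stands in for model_settings['average_window_width']
    pooled = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),  # pooling expects a trailing channels axis
        window_shape=[1, window_width],   # pool along frequency only, not time
        strides=[1, window_width],
        pooling_type='AVG',
        padding='SAME')
    print(pooled.shape)  # (1, 98, 43, 1): 257 bins -> ceil(257 / 6) = 43 features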
@@ -110,10 +125,12 @@ def main(_):
   # Create the model and load its weights.
   sess = tf.InteractiveSession()
-  create_inference_graph(FLAGS.wanted_words, FLAGS.sample_rate,
-                         FLAGS.clip_duration_ms, FLAGS.clip_stride_ms,
-                         FLAGS.window_size_ms, FLAGS.window_stride_ms,
-                         FLAGS.dct_coefficient_count, FLAGS.model_architecture)
+  create_inference_graph(
+      FLAGS.wanted_words, FLAGS.sample_rate, FLAGS.clip_duration_ms,
+      FLAGS.clip_stride_ms, FLAGS.window_size_ms, FLAGS.window_stride_ms,
+      FLAGS.feature_bin_count, FLAGS.model_architecture, FLAGS.preprocess)
+  if FLAGS.quantize:
+    tf.contrib.quantize.create_eval_graph()
   models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
 
   # Turn all the variables into inline constants inside the graph and save it.
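create_eval_graph() rewrites the inference graph with the same fake-quantization ops that tf.contrib.quantize.create_training_graph() inserts at training time, so a checkpoint trained with quantization enabled can be frozen into an eight-bit-friendly graph. A toy sketch of the assumed training-side pairing (the one-layer model here is an illustration, not the speech model):

    import tensorflow as tf

    # Stand-in model: a single dense layer over a flattened fingerprint.
    fingerprints = tf.placeholder(tf.float32, [None, 3920], name='fingerprint_input')
    logits = tf.layers.dense(fingerprints, 12)

    # Inserts fake-quant ops that record the ranges create_eval_graph() later needs.
    tf.contrib.quantize.create_training_graph(quant_delay=0)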
@@ -155,10 +172,11 @@ if __name__ == '__main__':
       default=10.0,
       help='How long the stride is between spectrogram timeslices',)
   parser.add_argument(
-      '--dct_coefficient_count',
+      '--feature_bin_count',
       type=int,
       default=40,
-      help='How many bins to use for the MFCC fingerprint',)
+      help='How many bins to use for the MFCC fingerprint',
+  )
   parser.add_argument(
       '--start_checkpoint',
       type=str,
@@ -176,5 +194,15 @@ if __name__ == '__main__':
       help='Words to use (others will be added to an unknown label)',)
   parser.add_argument(
       '--output_file', type=str, help='Where to save the frozen graph.')
+  parser.add_argument(
+      '--quantize',
+      type=bool,
+      default=False,
+      help='Whether to train the model for eight-bit deployment')
+  parser.add_argument(
+      '--preprocess',
+      type=str,
+      default='mfcc',
+      help='Spectrogram processing mode. Can be "mfcc" or "average"')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
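Putting the new flags together, a freeze invocation might look like the following (the checkpoint and output paths are placeholders):

    python tensorflow/examples/speech_commands/freeze.py \
        --start_checkpoint=/tmp/speech_commands_train/conv.ckpt-18000 \
        --output_file=/tmp/frozen_graph.pb \
        --feature_bin_count=40 \
        --preprocess=average \
        --quantize=True

One caveat with the flag as declared: argparse's type=bool treats any non-empty value, including the string 'False', as True, so pass --quantize only when quantization is actually wanted.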