diff options
Diffstat (limited to 'tensorflow/examples/speech_commands/freeze.py')
-rw-r--r-- | tensorflow/examples/speech_commands/freeze.py | 64 |
1 file changed, 46 insertions, 18 deletions
diff --git a/tensorflow/examples/speech_commands/freeze.py b/tensorflow/examples/speech_commands/freeze.py index c8671d9c41..89e790d4e4 100644 --- a/tensorflow/examples/speech_commands/freeze.py +++ b/tensorflow/examples/speech_commands/freeze.py @@ -54,7 +54,7 @@ FLAGS = None def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, clip_stride_ms, window_size_ms, window_stride_ms, - dct_coefficient_count, model_architecture): + feature_bin_count, model_architecture, preprocess): """Creates an audio model with the nodes needed for inference. Uses the supplied arguments to create a model, and inserts the input and @@ -67,14 +67,19 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, clip_stride_ms: How often to run recognition. Useful for models with cache. window_size_ms: Time slice duration to estimate frequencies from. window_stride_ms: How far apart time slices should be. - dct_coefficient_count: Number of frequency bands to analyze. + feature_bin_count: Number of frequency bands to analyze. model_architecture: Name of the kind of model to generate. + preprocess: How the spectrogram is processed to produce features, for + example 'mfcc' or 'average'. + + Raises: + Exception: If the preprocessing mode isn't recognized. 
""" words_list = input_data.prepare_words_list(wanted_words.split(',')) model_settings = models.prepare_model_settings( len(words_list), sample_rate, clip_duration_ms, window_size_ms, - window_stride_ms, dct_coefficient_count) + window_stride_ms, feature_bin_count, preprocess) runtime_settings = {'clip_stride_ms': clip_stride_ms} wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data') @@ -88,15 +93,25 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, window_size=model_settings['window_size_samples'], stride=model_settings['window_stride_samples'], magnitude_squared=True) - fingerprint_input = contrib_audio.mfcc( - spectrogram, - decoded_sample_data.sample_rate, - dct_coefficient_count=dct_coefficient_count) - fingerprint_frequency_size = model_settings['dct_coefficient_count'] - fingerprint_time_size = model_settings['spectrogram_length'] - reshaped_input = tf.reshape(fingerprint_input, [ - -1, fingerprint_time_size * fingerprint_frequency_size - ]) + + if preprocess == 'average': + fingerprint_input = tf.nn.pool( + tf.expand_dims(spectrogram, -1), + window_shape=[1, model_settings['average_window_width']], + strides=[1, model_settings['average_window_width']], + pooling_type='AVG', + padding='SAME') + elif preprocess == 'mfcc': + fingerprint_input = contrib_audio.mfcc( + spectrogram, + sample_rate, + dct_coefficient_count=model_settings['fingerprint_width']) + else: + raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or' + ' "average")' % (preprocess)) + + fingerprint_size = model_settings['fingerprint_size'] + reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size]) logits = models.create_model( reshaped_input, model_settings, model_architecture, is_training=False, @@ -110,10 +125,12 @@ def main(_): # Create the model and load its weights. 
sess = tf.InteractiveSession() - create_inference_graph(FLAGS.wanted_words, FLAGS.sample_rate, - FLAGS.clip_duration_ms, FLAGS.clip_stride_ms, - FLAGS.window_size_ms, FLAGS.window_stride_ms, - FLAGS.dct_coefficient_count, FLAGS.model_architecture) + create_inference_graph( + FLAGS.wanted_words, FLAGS.sample_rate, FLAGS.clip_duration_ms, + FLAGS.clip_stride_ms, FLAGS.window_size_ms, FLAGS.window_stride_ms, + FLAGS.feature_bin_count, FLAGS.model_architecture, FLAGS.preprocess) + if FLAGS.quantize: + tf.contrib.quantize.create_eval_graph() models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint) # Turn all the variables into inline constants inside the graph and save it. @@ -155,10 +172,11 @@ if __name__ == '__main__': default=10.0, help='How long the stride is between spectrogram timeslices',) parser.add_argument( - '--dct_coefficient_count', + '--feature_bin_count', type=int, default=40, - help='How many bins to use for the MFCC fingerprint',) + help='How many bins to use for the MFCC fingerprint', + ) parser.add_argument( '--start_checkpoint', type=str, @@ -176,5 +194,15 @@ if __name__ == '__main__': help='Words to use (others will be added to an unknown label)',) parser.add_argument( '--output_file', type=str, help='Where to save the frozen graph.') + parser.add_argument( + '--quantize', + type=bool, + default=False, + help='Whether to train the model for eight-bit deployment') + parser.add_argument( + '--preprocess', + type=str, + default='mfcc', + help='Spectrogram processing mode. Can be "mfcc" or "average"') FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) |