diff options
Diffstat (limited to 'tensorflow/examples/speech_commands/freeze.py')
-rw-r--r-- | tensorflow/examples/speech_commands/freeze.py | 64 |
1 file changed, 46 insertions, 18 deletions
diff --git a/tensorflow/examples/speech_commands/freeze.py b/tensorflow/examples/speech_commands/freeze.py index c8671d9c41..89e790d4e4 100644 --- a/tensorflow/examples/speech_commands/freeze.py +++ b/tensorflow/examples/speech_commands/freeze.py @@ -54,7 +54,7 @@ FLAGS = None def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, clip_stride_ms, window_size_ms, window_stride_ms, - dct_coefficient_count, model_architecture): + feature_bin_count, model_architecture, preprocess): """Creates an audio model with the nodes needed for inference. Uses the supplied arguments to create a model, and inserts the input and @@ -67,14 +67,19 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, clip_stride_ms: How often to run recognition. Useful for models with cache. window_size_ms: Time slice duration to estimate frequencies from. window_stride_ms: How far apart time slices should be. - dct_coefficient_count: Number of frequency bands to analyze. + feature_bin_count: Number of frequency bands to analyze. model_architecture: Name of the kind of model to generate. + preprocess: How the spectrogram is processed to produce features, for + example 'mfcc' or 'average'. + + Raises: + Exception: If the preprocessing mode isn't recognized. 
""" words_list = input_data.prepare_words_list(wanted_words.split(',')) model_settings = models.prepare_model_settings( len(words_list), sample_rate, clip_duration_ms, window_size_ms, - window_stride_ms, dct_coefficient_count) + window_stride_ms, feature_bin_count, preprocess) runtime_settings = {'clip_stride_ms': clip_stride_ms} wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data') @@ -88,15 +93,25 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, window_size=model_settings['window_size_samples'], stride=model_settings['window_stride_samples'], magnitude_squared=True) - fingerprint_input = contrib_audio.mfcc( - spectrogram, - decoded_sample_data.sample_rate, - dct_coefficient_count=dct_coefficient_count) - fingerprint_frequency_size = model_settings['dct_coefficient_count'] - fingerprint_time_size = model_settings['spectrogram_length'] - reshaped_input = tf.reshape(fingerprint_input, [ - -1, fingerprint_time_size * fingerprint_frequency_size - ]) + + if preprocess == 'average': + fingerprint_input = tf.nn.pool( + tf.expand_dims(spectrogram, -1), + window_shape=[1, model_settings['average_window_width']], + strides=[1, model_settings['average_window_width']], + pooling_type='AVG', + padding='SAME') + elif preprocess == 'mfcc': + fingerprint_input = contrib_audio.mfcc( + spectrogram, + sample_rate, + dct_coefficient_count=model_settings['fingerprint_width']) + else: + raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or' + ' "average")' % (preprocess)) + + fingerprint_size = model_settings['fingerprint_size'] + reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size]) logits = models.create_model( reshaped_input, model_settings, model_architecture, is_training=False, @@ -110,10 +125,12 @@ def main(_): # Create the model and load its weights. 
sess = tf.InteractiveSession() - create_inference_graph(FLAGS.wanted_words, FLAGS.sample_rate, - FLAGS.clip_duration_ms, FLAGS.clip_stride_ms, - FLAGS.window_size_ms, FLAGS.window_stride_ms, - FLAGS.dct_coefficient_count, FLAGS.model_architecture) + create_inference_graph( + FLAGS.wanted_words, FLAGS.sample_rate, FLAGS.clip_duration_ms, + FLAGS.clip_stride_ms, FLAGS.window_size_ms, FLAGS.window_stride_ms, + FLAGS.feature_bin_count, FLAGS.model_architecture, FLAGS.preprocess) + if FLAGS.quantize: + tf.contrib.quantize.create_eval_graph() models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint) # Turn all the variables into inline constants inside the graph and save it. @@ -155,10 +172,11 @@ if __name__ == '__main__': default=10.0, help='How long the stride is between spectrogram timeslices',) parser.add_argument( - '--dct_coefficient_count', + '--feature_bin_count', type=int, default=40, - help='How many bins to use for the MFCC fingerprint',) + help='How many bins to use for the MFCC fingerprint', + ) parser.add_argument( '--start_checkpoint', type=str, @@ -176,5 +194,15 @@ if __name__ == '__main__': help='Words to use (others will be added to an unknown label)',) parser.add_argument( '--output_file', type=str, help='Where to save the frozen graph.') + parser.add_argument( + '--quantize', + type=bool, + default=False, + help='Whether to train the model for eight-bit deployment') + parser.add_argument( + '--preprocess', + type=str, + default='mfcc', + help='Spectrogram processing mode. Can be "mfcc" or "average"') FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) |