# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Saves out a .wav file with synthesized conversational data and labels.

The best way to estimate the real-world performance of an audio recognition
model is by running it against a continuous stream of data, the way that it
would be used in an application. Training evaluations are only run against
discrete individual samples, so the results aren't as realistic.

To make it easy to run evaluations against audio streams, this script uses
samples from the testing partition of the data set, mixes them in at random
positions together with background noise, and saves out the result as one long
audio file.

Here's an example of generating a test file:

bazel run tensorflow/examples/speech_commands:generate_streaming_test_wav -- \
--data_dir=/tmp/my_wavs --background_dir=/tmp/my_backgrounds \
--background_volume=0.1 --test_duration_seconds=600 \
--output_audio_file=/tmp/streaming_test.wav \
--output_labels_file=/tmp/streaming_test_labels.txt
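
As well as the audio, the script writes out a labels file with one line per
mixed-in word, giving the label and the time in milliseconds at which that word
starts in the generated stream, for example "yes, 12345.000000".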

Once you've created a streaming audio file, you can then use the
test_streaming_accuracy tool to calculate accuracy metrics for a model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import math
import sys

import numpy as np
import tensorflow as tf

import input_data
import models

FLAGS = None


def mix_in_audio_sample(track_data, track_offset, sample_data, sample_offset,
                        clip_duration, sample_volume, ramp_in, ramp_out):
  """Mixes the sample data into the main track at the specified offset.

  Args:
    track_data: Numpy array holding main audio data. Modified in-place.
    track_offset: Where to mix the sample into the main track.
    sample_data: Numpy array of audio data to mix into the main track.
    sample_offset: Where to start in the audio sample.
    clip_duration: How long the sample segment is.
    sample_volume: Loudness to mix the sample in at.
    ramp_in: Length in samples of volume increase stage.
    ramp_out: Length in samples of volume decrease stage.
  """
  ramp_out_index = clip_duration - ramp_out
  track_end = min(track_offset + clip_duration, track_data.shape[0])
  track_end = min(track_end,
                  track_offset + (sample_data.shape[0] - sample_offset))
  sample_range = track_end - track_offset
  for i in range(sample_range):
    if i < ramp_in:
      envelope_scale = i / ramp_in
    elif i > ramp_out_index:
      envelope_scale = (clip_duration - i) / ramp_out
    else:
      envelope_scale = 1
    sample_input = sample_data[sample_offset + i]
    track_data[track_offset + i] += (
        sample_input * envelope_scale * sample_volume)

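# Illustrative sketch only (kept as a comment so it doesn't run): mixing a
# short 440 Hz tone into a silent one-second track at 16 kHz, with 100-sample
# fade ramps at each end. The array names here are hypothetical.
#
#   track = np.zeros(16000, dtype=np.float32)
#   tone = np.sin(2 * np.pi * 440 * np.arange(1600) / 16000).astype(np.float32)
#   mix_in_audio_sample(track, track_offset=4000, sample_data=tone,
#                       sample_offset=0, clip_duration=1600, sample_volume=0.5,
#                       ramp_in=100, ramp_out=100)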

def main(_):
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)
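  # With the default flags this is 16000 samples/second * 600 seconds =
  # 9,600,000 samples of silence that the background and word clips get mixed
  # into below.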

  # Set up background audio.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)
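  # With the default flags (clip_duration_ms=1000, sample_rate=16000) each
  # background segment covers 1500 ms (24000 samples), segments start every
  # 16000 samples, and the 250 ms ramps are 4000 samples long, so consecutive
  # segments cross-fade into one another.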

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
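  # With the default flags the stride is 3000 ms (48000 samples), each word is
  # shifted by a random gap of up to 32000 samples, and a ten-minute file holds
  # floor(9600000 / 48000) = 200 words.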
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
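      # words_list starts with the silence and unknown labels, so pick a
      # random word from the entries after those first two.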
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

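  # Each line of the labels file is the word's label followed by the
  # millisecond offset at which it was mixed in, e.g. "yes, 12345.000000".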
  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s', FLAGS.output_labels_file)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--data_url',
      type=str,
      # pylint: disable=line-too-long
      default='http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
      # pylint: enable=line-too-long
      help='Location of speech training data')
  parser.add_argument(
      '--data_dir',
      type=str,
      default='/tmp/speech_dataset',
      help="""\
      Where the speech training data has been downloaded to.
      """)
  parser.add_argument(
      '--background_dir',
      type=str,
      default='',
      help="""\
      Path to a directory of .wav files to mix in as background noise.
      """)
  parser.add_argument(
      '--background_volume',
      type=float,
      default=0.1,
      help="""\
      How loud the background noise should be, between 0 and 1.
      """)
  parser.add_argument(
      '--background_frequency',
      type=float,
      default=0.8,
      help="""\
      What proportion of the samples should have background noise mixed in.
      """)
  parser.add_argument(
      '--silence_percentage',
      type=float,
      default=10.0,
      help="""\
      How much of the training data should be silence.
      """)
  parser.add_argument(
      '--testing_percentage',
      type=int,
      default=10,
      help='What percentage of wavs to use as a test set.')
  parser.add_argument(
      '--validation_percentage',
      type=int,
      default=10,
      help='What percentage of wavs to use as a validation set.')
  parser.add_argument(
      '--sample_rate',
      type=int,
      default=16000,
      help='Expected sample rate of the wavs.',)
  parser.add_argument(
      '--clip_duration_ms',
      type=int,
      default=1000,
      help='Expected duration in milliseconds of the wavs.',)
  parser.add_argument(
      '--window_size_ms',
      type=float,
      default=30.0,
      help='How long each spectrogram timeslice is.',)
  parser.add_argument(
      '--window_stride_ms',
      type=float,
      default=10.0,
      help='How long the stride is between spectrogram timeslices.',)
  parser.add_argument(
      '--dct_coefficient_count',
      type=int,
      default=40,
      help='How many bins to use for the MFCC fingerprint.',)
  parser.add_argument(
      '--wanted_words',
      type=str,
      default='yes,no,up,down,left,right,on,off,stop,go',
      help='Words to use (others will be added to an unknown label).',)
  parser.add_argument(
      '--output_audio_file',
      type=str,
      default='/tmp/speech_commands_train/streaming_test.wav',
      help='File to save the generated test audio to.')
  parser.add_argument(
      '--output_labels_file',
      type=str,
      default='/tmp/speech_commands_train/streaming_test_labels.txt',
      help='File to save the generated test labels to.')
  parser.add_argument(
      '--test_duration_seconds',
      type=int,
      default=600,
      help='How long the generated test audio file should be.',)
  parser.add_argument(
      '--word_gap_ms',
      type=int,
      default=2000,
      help='How long the average gap should be between words.',)
  parser.add_argument(
      '--unknown_percentage',
      type=int,
      default=30,
      help='What percentage of words should be unknown.')

  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)