path: root/tensorflow/examples/wav_to_spectrogram
diff options
authorGravatar Pete Warden <petewarden@google.com>2017-04-05 17:10:48 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-04-05 18:24:52 -0700
commita24c6b842d982de8a38ae5058ace91cb47ee3cef (patch)
tree6e7a909a5e8ccde22caa2d7b5f4a1a84dda3cdd2 /tensorflow/examples/wav_to_spectrogram
parent9d57702513001bfded19e72c76a986cce56d5f00 (diff)
Add AudioSpectrogram op to TensorFlow for audio feature generation
Change: 152332221
Diffstat (limited to 'tensorflow/examples/wav_to_spectrogram')
6 files changed, 348 insertions, 0 deletions
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
new file mode 100644
index 0000000000..1e72324fb0
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -0,0 +1,68 @@
+# Description:
+# TensorFlow C++ example for converting .wav audio into a spectrogram image.
+ default_visibility = ["//tensorflow:internal"],
+ features = [
+ "-layering_check",
+ "-parse_headers",
+ ],
+licenses(["notice"]) # Apache 2.0
+ name = "wav_to_spectrogram_lib",
+ srcs = [
+ "wav_to_spectrogram.cc",
+ ],
+ hdrs = [
+ "wav_to_spectrogram.h",
+ ],
+ deps = [
+ "//tensorflow/cc:cc_ops",
+ "//tensorflow/core:framework_internal",
+ "//tensorflow/core:tensorflow",
+ ],
+ name = "wav_to_spectrogram",
+ srcs = [
+ "main.cc",
+ ],
+ deps = [
+ ":wav_to_spectrogram_lib",
+ "//tensorflow/core:framework_internal",
+ "//tensorflow/core:tensorflow",
+ ],
+ name = "wav_to_spectrogram_test",
+ size = "medium",
+ srcs = ["wav_to_spectrogram_test.cc"],
+ deps = [
+ ":wav_to_spectrogram_lib",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ "bin/**",
+ "gen/**",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
diff --git a/tensorflow/examples/wav_to_spectrogram/README.md b/tensorflow/examples/wav_to_spectrogram/README.md
new file mode 100644
index 0000000000..7f7eb43700
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/README.md
@@ -0,0 +1,49 @@
+# TensorFlow Spectrogram Example
+This example shows how you can load audio from a .wav file, convert it to a
+spectrogram, and then save it out as a PNG image. A spectrogram is a
+visualization of the frequencies in sound over time, and can be useful as a
+feature for neural network recognition on noise or speech.
+## Building
+To build it, run this command:
+bazel build tensorflow/examples/wav_to_spectrogram/...
+That should build a binary executable that you can then run like this:
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram
+This uses a default test audio file that's part of the TensorFlow source code,
+and writes out the image to the current directory as spectrogram.png.
+## Options
+To load your own audio, you need to supply a .wav file in LIN16 format, and use
+the `--input_wav` flag to pass in the path.
+To control how the spectrogram is created, you can specify the `--window_size`
+and `--stride` arguments, which control how wide the window used to estimate
+frequencies is, and how widely adjacent windows are spaced.
+The `--output_image` flag sets the path to save the image file to. This is
+always written out in PNG format, even if you specify a different file
+extension.
+If your result seems too dark, try using the `--brightness` flag to make the
+output image easier to see.
+Here's an example of how to use all of them together:
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram \
+--input_wav=/tmp/my_audio.wav \
+--window_size=1024 \
+--stride=512 \
+--brightness=64 \
+--output_image=/tmp/my_spectrogram.png
diff --git a/tensorflow/examples/wav_to_spectrogram/main.cc b/tensorflow/examples/wav_to_spectrogram/main.cc
new file mode 100644
index 0000000000..539e6c4fe4
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/main.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+See the License for the specific language governing permissions and
+limitations under the License.
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+// Command-line entry point. Parses the flags declared below and converts the
+// input .wav file into a PNG spectrogram by calling WavToSpectrogram().
+// Returns 0 on success, and -1 if flag parsing fails, an unrecognized argument
+// is left over, or the conversion itself reports an error.
+int main(int argc, char* argv[]) {
+ // These are the command-line flags the program can understand.
+ // They define which audio file is loaded, how the spectrogram windows are
+ // sized and spaced, how bright the rendered image is, and where the PNG is
+ // written. The initial values below are used when a flag is not supplied.
+ tensorflow::string input_wav =
+ "tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav";
+ tensorflow::int32 window_size = 256;
+ tensorflow::int32 stride = 128;
+ float brightness = 64.0f;
+ tensorflow::string output_image = "spectrogram.png";
+ std::vector<tensorflow::Flag> flag_list = {
+ tensorflow::Flag("input_wav", &input_wav, "audio file to load"),
+ tensorflow::Flag("window_size", &window_size,
+ "frequency sample window width"),
+ tensorflow::Flag("stride", &stride,
+ "how far apart to place frequency windows"),
+ tensorflow::Flag("brightness", &brightness,
+ "controls how bright the output image is"),
+ tensorflow::Flag("output_image", &output_image,
+ "where to save the spectrogram image to"),
+ };
+ // Build the usage text up front so it can be shown on any argument error.
+ tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+ const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+ if (!parse_result) {
+ LOG(ERROR) << usage;
+ return -1;
+ }
+ // We need to call this to set up global state for TensorFlow.
+ tensorflow::port::InitMain(argv[0], &argc, &argv);
+ // Anything still present beyond the program name at this point is treated
+ // as an unknown argument.
+ if (argc > 1) {
+ LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+ return -1;
+ }
+ tensorflow::Status wav_status = WavToSpectrogram(
+ input_wav, window_size, stride, brightness, output_image);
+ if (!wav_status.ok()) {
+ LOG(ERROR) << "WavToSpectrogram failed with " << wav_status;
+ return -1;
+ }
+ return 0;
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
new file mode 100644
index 0000000000..c69a359637
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+See the License for the specific language governing permissions and
+limitations under the License.
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+#include <vector>
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_UINT8;
+using tensorflow::Output;
+using tensorflow::TensorShape;
+// Runs a TensorFlow graph to convert an audio file into a visualization.
+//
+// Reads and decodes the .wav file at `input_wav`, computes a spectrogram from
+// it using `window_size` and `stride`, multiplies the result by `brightness`,
+// caps it at 255, converts it to uint8, and writes it as a PNG to
+// `output_image`. Returns a non-OK status if building the graph, creating the
+// session, or running the graph fails.
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+ tensorflow::int32 window_size,
+ tensorflow::int32 stride, float brightness,
+ const tensorflow::string& output_image) {
+ auto root = tensorflow::Scope::NewRootScope();
+ using namespace tensorflow::ops; // NOLINT(build/namespaces)
+ // The following block creates a TensorFlow graph that:
+ // - Reads and decodes the audio file into a tensor of float samples.
+ // - Creates a float spectrogram from those samples.
+ // - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
+ // - Reshapes the tensor so that it's [height, width, 1] for imaging.
+ // - Encodes it as a PNG stream and saves it out to a file.
+ Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+ DecodeWav wav_decoder =
+ DecodeWav(root.WithOpName("wav_decoder"), file_reader);
+ Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
+ wav_decoder.audio, window_size, stride);
+ // Brightness is fed in at run time through a scalar placeholder rather than
+ // being baked into the graph as a constant.
+ Output brightness_placeholder =
+ Placeholder(root.WithOpName("brightness_placeholder"), DT_FLOAT,
+ Placeholder::Attrs().Shape(TensorShape({})));
+ Output mul = Mul(root.WithOpName("mul"), spectrogram, brightness_placeholder);
+ // Cap the scaled values at 255 before the cast to uint8.
+ Output min_const = Const(root.WithOpName("min_const"), 255.0f);
+ Output min = Minimum(root.WithOpName("min"), mul, min_const);
+ Output cast = Cast(root.WithOpName("cast"), min, DT_UINT8);
+ // Append a trailing dimension of size one, then drop dimension 0.
+ // NOTE(review): presumably dimension 0 is the audio channel, so this only
+ // works for single-channel input -- confirm against the AudioSpectrogram op.
+ Output expand_dims_const = Const(root.WithOpName("expand_dims_const"), -1);
+ Output expand_dims =
+ ExpandDims(root.WithOpName("expand_dims"), cast, expand_dims_const);
+ Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
+ Squeeze::Attrs().SqueezeDims({0}));
+ Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
+ WriteFile file_writer =
+ WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+ tensorflow::GraphDef graph;
+ TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
+ // Build a session object from this graph definition. The power of TensorFlow
+ // is that you can reuse complex computations like this, so usually we'd run a
+ // lot of different inputs through it. In this example, we're just doing a
+ // one-off run, so we'll create it and then use it immediately.
+ // The Status-returning NewSession overload is used so that a failure to
+ // create the session is reported instead of dereferencing a null pointer.
+ tensorflow::Session* raw_session = nullptr;
+ TF_RETURN_IF_ERROR(
+ tensorflow::NewSession(tensorflow::SessionOptions(), &raw_session));
+ std::unique_ptr<tensorflow::Session> session(raw_session);
+ TF_RETURN_IF_ERROR(session->Create(graph));
+ // We're passing in the brightness as an input, so create a tensor to hold the
+ // value.
+ tensorflow::Tensor brightness_tensor(DT_FLOAT, TensorShape({}));
+ brightness_tensor.scalar<float>()() = brightness;
+ // Run the session to analyze the audio and write out the file. No output
+ // tensors are fetched; "output_image" is run purely as a target node, and any
+ // failure is propagated to the caller.
+ TF_RETURN_IF_ERROR(
+ session->Run({{"brightness_placeholder", brightness_tensor}}, {},
+ {"output_image"}, nullptr));
+ return tensorflow::Status::OK();
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
new file mode 100644
index 0000000000..fa8cb0abe9
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+See the License for the specific language governing permissions and
+limitations under the License.
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+// Runs a TensorFlow graph to convert an audio file into a visualization. Takes
+// in the path to the audio file, the window size and stride parameters
+// controlling the spectrogram creation, the brightness scaling to use, and a
+// path to save the output PNG file to.
+//
+// Parameters:
+//   input_wav:    path of the .wav audio file to read.
+//   window_size:  window size passed to the spectrogram computation.
+//   stride:       stride passed to the spectrogram computation.
+//   brightness:   factor the spectrogram values are multiplied by before they
+//                 are converted to 8-bit pixel values.
+//   output_image: path the PNG-encoded image is written to.
+//
+// Returns a tensorflow::Status; callers should check it with ok().
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+ tensorflow::int32 window_size,
+ tensorflow::int32 stride, float brightness,
+ const tensorflow::string& output_image);
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
new file mode 100644
index 0000000000..e599711445
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+See the License for the specific language governing permissions and
+limitations under the License.
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/wav/wav_io.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+// End-to-end smoke test: encodes eight float samples as a .wav file in the
+// test temp directory, runs WavToSpectrogram() on it, and checks that the
+// output image file exists afterwards.
+TEST(WavToSpectrogramTest, WavToSpectrogramTest) {
+ const tensorflow::string input_wav =
+ tensorflow::io::JoinPath(tensorflow::testing::TmpDir(), "input_wav.wav");
+ const tensorflow::string output_image = tensorflow::io::JoinPath(
+ tensorflow::testing::TmpDir(), "output_image.png");
+ float audio[8] = {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+ tensorflow::string wav_string;
+ // Encoding returns a Status; assert on it so a failed encode stops the test
+ // here rather than surfacing later as a confusing read error.
+ TF_ASSERT_OK(
+ tensorflow::wav::EncodeAudioAsS16LEWav(audio, 44100, 1, 8, &wav_string));
+ TF_ASSERT_OK(tensorflow::WriteStringToFile(tensorflow::Env::Default(),
+ input_wav, wav_string));
+ // Window size and stride are both 4 here.
+ TF_ASSERT_OK(WavToSpectrogram(input_wav, 4, 4, 64.0f, output_image));
+ TF_EXPECT_OK(tensorflow::Env::Default()->FileExists(output_image));
+}