Add AudioSpectrogram op to TensorFlow for audio feature generation

Change: 152332221
author: Pete Warden <petewarden@google.com> 2017-04-05 17:10:48 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-04-05 18:24:52 -0700
commit: a24c6b842d982de8a38ae5058ace91cb47ee3cef (patch)
tree: 6e7a909a5e8ccde22caa2d7b5f4a1a84dda3cdd2 /tensorflow/examples/wav_to_spectrogram
parent: 9d57702513001bfded19e72c76a986cce56d5f00 (diff)
6 files changed, 348 insertions, 0 deletions
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
new file mode 100644
index 0000000000..1e72324fb0
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -0,0 +1,68 @@
+# Description:
+#   TensorFlow C++ example for converting a .wav file to a spectrogram image.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "wav_to_spectrogram_lib",
+    srcs = [
+        "wav_to_spectrogram.cc",
+    ],
+    hdrs = [
+        "wav_to_spectrogram.h",
+    ],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+cc_binary(
+    name = "wav_to_spectrogram",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        ":wav_to_spectrogram_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+cc_test(
+    name = "wav_to_spectrogram_test",
+    size = "medium",
+    srcs = ["wav_to_spectrogram_test.cc"],
+    deps = [
+        ":wav_to_spectrogram_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "bin/**",
+            "gen/**",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/examples/wav_to_spectrogram/README.md b/tensorflow/examples/wav_to_spectrogram/README.md
new file mode 100644
index 0000000000..7f7eb43700
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/README.md
@@ -0,0 +1,49 @@
+# TensorFlow Spectrogram Example
+
+This example shows how you can load audio from a .wav file, convert it to a
+spectrogram, and then save it out as a PNG image. A spectrogram is a
+visualization of the frequencies in sound over time, and can be useful as a
+feature for neural network recognition on noise or speech.
+
+## Building
+
+To build it, run this command:
+
+```bash
+bazel build tensorflow/examples/wav_to_spectrogram/...
+```
+
+That should build a binary executable that you can then run like this:
+
+```bash
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram
+```
+
+This uses a default test audio file that's part of the TensorFlow source code,
+and writes out the image to the current directory as spectrogram.png.
+
+## Options
+
+To load your own audio, you need to supply a .wav file in LIN16 format, and use
+the `--input_wav` flag to pass in the path.
+
+To control how the spectrogram is created, you can specify the `--window_size`
+and `--stride` arguments, which control how wide the window used to estimate
+frequencies is, and how widely adjacent windows are spaced.
+
+The `--output_image` flag sets the path to save the image file to. This is
+always written out in PNG format, even if you specify a different file
+extension.
+
+If your result seems too dark, try using the `--brightness` flag to make the
+output image easier to see.
+
+Here's an example of how to use all of them together:
+
+```bash
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram \
+--input_wav=/tmp/my_audio.wav \
+--window_size=1024 \
+--stride=512 \
+--output_image=/tmp/my_spectrogram.png
+```
diff --git a/tensorflow/examples/wav_to_spectrogram/main.cc b/tensorflow/examples/wav_to_spectrogram/main.cc
new file mode 100644
index 0000000000..539e6c4fe4
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/main.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+int main(int argc, char* argv[]) {
+  // These are the command-line flags the program can understand.
+  // They define which audio file is read, the window size and stride used to
+  // build the spectrogram, how much the output is brightened, and where the
+  // resulting image is written. The values below are the defaults.
+  tensorflow::string input_wav =
+      "tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav";
+  tensorflow::int32 window_size = 256;
+  tensorflow::int32 stride = 128;
+  float brightness = 64.0f;
+  tensorflow::string output_image = "spectrogram.png";
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("input_wav", &input_wav, "audio file to load"),
+      tensorflow::Flag("window_size", &window_size,
+                       "frequency sample window width"),
+      tensorflow::Flag("stride", &stride,
+                       "how far apart to place frequency windows"),
+      tensorflow::Flag("brightness", &brightness,
+                       "controls how bright the output image is"),
+      tensorflow::Flag("output_image", &output_image,
+                       "where to save the spectrogram image to"),
+  };
+  tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << usage;
+    return -1;
+  }
+
+  // We need to call this to set up global state for TensorFlow.
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return -1;
+  }
+
+  tensorflow::Status wav_status = WavToSpectrogram(
+      input_wav, window_size, stride, brightness, output_image);
+  if (!wav_status.ok()) {
+    LOG(ERROR) << "WavToSpectrogram failed with " << wav_status;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
new file mode 100644
index 0000000000..c69a359637
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include <vector>
+
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_UINT8;
+using tensorflow::Output;
+using tensorflow::TensorShape;
+
+// Converts the audio file input_wav into a spectrogram saved as a PNG at
+// output_image, via a one-off TensorFlow graph. Returns OK or the first error.
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+                                    tensorflow::int32 window_size,
+                                    tensorflow::int32 stride, float brightness,
+                                    const tensorflow::string& output_image) {
+  auto root = tensorflow::Scope::NewRootScope();
+  using namespace tensorflow::ops;  // NOLINT(build/namespaces)
+  // The following block creates a TensorFlow graph that:
+  //  - Reads and decodes the audio file into a tensor of float samples.
+  //  - Creates a float spectrogram from those samples.
+  //  - Scales by brightness, caps at 255, and casts the result to uint8.
+  //  - Reshapes the tensor so that it's [height, width, 1] for imaging.
+  //  - Encodes it as a PNG stream and saves it out to a file.
+  Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+  DecodeWav wav_decoder =
+      DecodeWav(root.WithOpName("wav_decoder"), file_reader);
+  Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
+                                        wav_decoder.audio, window_size, stride);
+  Output brightness_placeholder =
+      Placeholder(root.WithOpName("brightness_placeholder"), DT_FLOAT,
+                  Placeholder::Attrs().Shape(TensorShape({})));
+  Output mul = Mul(root.WithOpName("mul"), spectrogram, brightness_placeholder);
+  Output min_const = Const(root.WithOpName("min_const"), 255.0f);
+  Output min = Minimum(root.WithOpName("min"), mul, min_const);
+  Output cast = Cast(root.WithOpName("cast"), min, DT_UINT8);
+  Output expand_dims_const = Const(root.WithOpName("expand_dims_const"), -1);
+  Output expand_dims =
+      ExpandDims(root.WithOpName("expand_dims"), cast, expand_dims_const);
+  Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
+                           Squeeze::Attrs().SqueezeDims({0}));
+  Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
+  WriteFile file_writer =
+      WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+  tensorflow::GraphDef graph;
+  TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
+
+  // Build a session for this one-off run. NewSession() can hand back a null
+  // pointer on failure, so check it before the first dereference.
+  std::unique_ptr<tensorflow::Session> session(
+      tensorflow::NewSession(tensorflow::SessionOptions()));
+  if (session == nullptr) {
+    return tensorflow::errors::Internal("Could not create TensorFlow session");
+  }
+  TF_RETURN_IF_ERROR(session->Create(graph));
+
+  // The brightness is fed in at run time through a scalar float tensor.
+  tensorflow::Tensor brightness_tensor(DT_FLOAT, TensorShape({}));
+  brightness_tensor.scalar<float>()() = brightness;
+  // Run the session to analyze the audio and write out the file.
+  TF_RETURN_IF_ERROR(
+      session->Run({{"brightness_placeholder", brightness_tensor}}, {},
+                   {"output_image"}, nullptr));
+  return tensorflow::Status::OK();
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
new file mode 100644
index 0000000000..fa8cb0abe9
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#define THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+// Runs a TensorFlow graph that reads the audio file at input_wav, builds a
+// spectrogram from it using the given window_size and stride, scales it by
+// brightness, and writes the result as a PNG to output_image. Returns an OK
+// status on success, or the first error encountered otherwise.
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+                                    tensorflow::int32 window_size,
+                                    tensorflow::int32 stride, float brightness,
+                                    const tensorflow::string& output_image);
+
+#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
new file mode 100644
index 0000000000..e599711445
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/wav/wav_io.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+TEST(WavToSpectrogramTest, WavToSpectrogramTest) {
+  // Write a tiny mono clip to disk, convert it, and check the image exists.
+  const tensorflow::string tmp_dir = tensorflow::testing::TmpDir();
+  const auto wav_path = tensorflow::io::JoinPath(tmp_dir, "input_wav.wav");
+  const auto png_path = tensorflow::io::JoinPath(tmp_dir, "output_image.png");
+  float samples[8] = {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  tensorflow::string encoded_wav;
+  TF_ASSERT_OK(tensorflow::wav::EncodeAudioAsS16LEWav(samples, 44100, 1, 8,
+                                                      &encoded_wav));
+  tensorflow::Env* const env = tensorflow::Env::Default();
+  TF_ASSERT_OK(tensorflow::WriteStringToFile(env, wav_path, encoded_wav));
+  TF_ASSERT_OK(WavToSpectrogram(wav_path, 4, 4, 64.0f, png_path));
+  TF_EXPECT_OK(env->FileExists(png_path));
+}
author	Pete Warden <petewarden@google.com>	2017-04-05 17:10:48 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-04-05 18:24:52 -0700
commit	a24c6b842d982de8a38ae5058ace91cb47ee3cef (patch)
tree	6e7a909a5e8ccde22caa2d7b5f4a1a84dda3cdd2 /tensorflow/examples/wav_to_spectrogram
parent	9d57702513001bfded19e72c76a986cce56d5f00 (diff)