aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Pete Warden <petewarden@google.com>2017-04-05 17:10:48 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-04-05 18:24:52 -0700
commita24c6b842d982de8a38ae5058ace91cb47ee3cef (patch)
tree6e7a909a5e8ccde22caa2d7b5f4a1a84dda3cdd2
parent9d57702513001bfded19e72c76a986cce56d5f00 (diff)
Add AudioSpectrogram op to TensorFlow for audio feature generation
Change: 152332221
-rw-r--r--tensorflow/BUILD1
-rw-r--r--tensorflow/contrib/cmake/CMakeLists.txt3
-rw-r--r--tensorflow/contrib/cmake/external/fft2d.cmake37
-rw-r--r--tensorflow/contrib/cmake/patches/fft2d/CMakeLists.txt17
-rw-r--r--tensorflow/core/BUILD9
-rw-r--r--tensorflow/core/kernels/BUILD111
-rw-r--r--tensorflow/core/kernels/spectrogram.cc212
-rw-r--r--tensorflow/core/kernels/spectrogram.h112
-rw-r--r--tensorflow/core/kernels/spectrogram_convert_test_data.cc56
-rw-r--r--tensorflow/core/kernels/spectrogram_op.cc120
-rw-r--r--tensorflow/core/kernels/spectrogram_op_test.cc104
-rw-r--r--tensorflow/core/kernels/spectrogram_test.cc340
-rw-r--r--tensorflow/core/kernels/spectrogram_test_data/README8
-rw-r--r--tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wavbin0 -> 91784 bytes
-rw-r--r--tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.binbin0 -> 365968 bytes
-rw-r--r--tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.binbin0 -> 468768 bytes
-rw-r--r--tensorflow/core/kernels/spectrogram_test_utils.cc288
-rw-r--r--tensorflow/core/kernels/spectrogram_test_utils.h81
-rw-r--r--tensorflow/core/lib/core/bits.h13
-rw-r--r--tensorflow/core/ops/audio_ops.cc79
-rw-r--r--tensorflow/core/platform/default/build_config/BUILD1
-rw-r--r--tensorflow/core/util/command_line_flags.cc28
-rw-r--r--tensorflow/core/util/command_line_flags.h4
-rw-r--r--tensorflow/core/util/command_line_flags_test.cc35
-rw-r--r--tensorflow/examples/wav_to_spectrogram/BUILD68
-rw-r--r--tensorflow/examples/wav_to_spectrogram/README.md49
-rw-r--r--tensorflow/examples/wav_to_spectrogram/main.cc66
-rw-r--r--tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc97
-rw-r--r--tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h31
-rw-r--r--tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc37
-rw-r--r--tensorflow/tools/lib_package/BUILD4
-rw-r--r--tensorflow/tools/pip_package/BUILD2
-rw-r--r--tensorflow/workspace.bzl10
-rw-r--r--third_party/fft2d/BUILD30
-rw-r--r--third_party/fft2d/LICENSE3
-rw-r--r--third_party/fft2d/fft.h36
-rw-r--r--third_party/fft2d/fft2d.BUILD36
37 files changed, 2120 insertions, 8 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 37dc8e265f..5d2b1e74df 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -276,6 +276,7 @@ filegroup(
"//tensorflow/examples/tutorials/estimators:all_files",
"//tensorflow/examples/tutorials/mnist:all_files",
"//tensorflow/examples/tutorials/word2vec:all_files",
+ "//tensorflow/examples/wav_to_spectrogram:all_files",
"//tensorflow/go:all_files",
"//tensorflow/java:all_files",
"//tensorflow/java/src/main/java/org/tensorflow/examples:all_files",
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index e27df6898e..31a3d45a98 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -108,6 +108,7 @@ include(eigen)
include(gemmlowp)
include(jsoncpp)
include(farmhash)
+include(fft2d)
include(highwayhash)
include(protobuf)
if (tensorflow_BUILD_CC_TESTS)
@@ -121,6 +122,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
${jpeg_STATIC_LIBRARIES}
${jsoncpp_STATIC_LIBRARIES}
${farmhash_STATIC_LIBRARIES}
+ ${fft2d_STATIC_LIBRARIES}
${highwayhash_STATIC_LIBRARIES}
${protobuf_STATIC_LIBRARIES}
)
@@ -135,6 +137,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
protobuf
eigen
gemmlowp
+ fft2d
)
include_directories(
diff --git a/tensorflow/contrib/cmake/external/fft2d.cmake b/tensorflow/contrib/cmake/external/fft2d.cmake
new file mode 100644
index 0000000000..50c6b91684
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/fft2d.cmake
@@ -0,0 +1,37 @@
+include (ExternalProject)  # Provides ExternalProject_Add().
+
+set(fft2d_URL http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz)  # Upstream source tarball.
+set(fft2d_HASH SHA256=52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296)
+set(fft2d_BUILD ${CMAKE_CURRENT_BINARY_DIR}/fft2d/)  # Matches the relative PREFIX used below.
+set(fft2d_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/fft2d/src)
+
+if(WIN32)
+  set(fft2d_STATIC_LIBRARIES ${fft2d_BUILD}/src/lib/fft2d.lib)  # Installed under ${fft2d_INSTALL}/lib.
+
+  ExternalProject_Add(fft2d
+      PREFIX fft2d
+      URL ${fft2d_URL}
+      URL_HASH ${fft2d_HASH}
+      DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_IN_SOURCE 1
+      PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/fft2d/CMakeLists.txt ${fft2d_BUILD}/src/fft2d/CMakeLists.txt
+      INSTALL_DIR ${fft2d_INSTALL}
+      CMAKE_CACHE_ARGS
+          -DCMAKE_BUILD_TYPE:STRING=Release
+          -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+          -DCMAKE_INSTALL_PREFIX:STRING=${fft2d_INSTALL})
+else()
+  set(fft2d_STATIC_LIBRARIES ${fft2d_BUILD}/src/fft2d/libfft2d.a)  # Used straight from the in-source build.
+
+  ExternalProject_Add(fft2d
+      PREFIX fft2d
+      URL ${fft2d_URL}
+      URL_HASH ${fft2d_HASH}
+      DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_IN_SOURCE 1
+      PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/fft2d/CMakeLists.txt ${fft2d_BUILD}/src/fft2d/CMakeLists.txt
+      INSTALL_DIR ${fft2d_INSTALL}  # CMake variable reference: must be ${...}, not $(...).
+      INSTALL_COMMAND echo  # No real install step; the archive above lives in the source dir.
+      BUILD_COMMAND $(MAKE))  # $(MAKE) is a make variable, deliberately left for make to expand.
+
+endif()
diff --git a/tensorflow/contrib/cmake/patches/fft2d/CMakeLists.txt b/tensorflow/contrib/cmake/patches/fft2d/CMakeLists.txt
new file mode 100644
index 0000000000..b31ea3ed98
--- /dev/null
+++ b/tensorflow/contrib/cmake/patches/fft2d/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 2.8.3)
+
+project(fft2d)  # Build description dropped into the unpacked fft2d sources.
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)  # Compile objects as position-independent code.
+
+set(FFT2D_SRCS  # Sources compiled into the fft2d library.
+ "fftsg.c"
+)
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
+
+add_library(fft2d ${FFT2D_SRCS})  # No explicit STATIC/SHARED keyword here.
+
+install(TARGETS fft2d  # Shared and static outputs both install under lib/.
+ LIBRARY DESTINATION lib COMPONENT RuntimeLibraries
+ ARCHIVE DESTINATION lib COMPONENT Development)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ba761cd7c6..6b7e297c4f 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -494,7 +494,6 @@ cc_library(
tf_gen_op_libs(
op_lib_names = [
"array_ops",
- "audio_ops",
"candidate_sampling_ops",
"control_flow_ops",
"ctc_ops",
@@ -524,6 +523,13 @@ tf_gen_op_libs(
],
)
+tf_gen_op_libs(
+ op_lib_names = [
+ "audio_ops",
+ ],
+ deps = [":lib"],
+)
+
cc_library(
name = "debug_ops_op_lib",
srcs = ["ops/debug_ops.cc"],
@@ -686,6 +692,7 @@ cc_library(
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core/kernels:array",
+ "//tensorflow/core/kernels:audio",
"//tensorflow/core/kernels:bincount_op",
"//tensorflow/core/kernels:candidate_sampler_ops",
"//tensorflow/core/kernels:control_flow_ops",
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 9c47d520d9..0a4fd0f256 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -3537,6 +3537,117 @@ tf_kernel_library(
],
)
+filegroup(
+ name = "spectrogram_test_data",
+ srcs = [
+ "spectrogram_test_data/short_test_segment.wav",
+ "spectrogram_test_data/short_test_segment_spectrogram.csv.bin",
+ "spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin",
+ ],
+ visibility = ["//visibility:public"],
+)
+
+cc_library(
+ name = "spectrogram",
+ srcs = ["spectrogram.cc"],
+ hdrs = ["spectrogram.h"],
+ copts = tf_copts(),
+ deps = [
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//third_party/fft2d:fft2d_headers",
+ "@fft2d//:fft2d",
+ ],
+)
+
+cc_library(
+ name = "spectrogram_test_utils",
+ testonly = 1,
+ srcs = ["spectrogram_test_utils.cc"],
+ hdrs = ["spectrogram_test_utils.h"],
+ copts = tf_copts(),
+ deps = [
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ ],
+)
+
+cc_binary(
+ name = "spectrogram_convert_test_data",
+ testonly = 1,
+ srcs = ["spectrogram_convert_test_data.cc"],
+ deps = [
+ ":spectrogram_test_utils",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ ],
+)
+
+tf_cc_test(
+ name = "spectrogram_test",
+ size = "medium",
+ srcs = ["spectrogram_test.cc"],
+ data = [":spectrogram_test_data"],
+ deps = [
+ ":spectrogram",
+ ":spectrogram_test_utils",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:lib_test_internal",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//third_party/eigen3",
+ ],
+)
+
+tf_kernel_library(
+ name = "spectrogram_op",
+ prefix = "spectrogram_op",
+ deps = [
+ ":spectrogram",
+ "//tensorflow/core:audio_ops_op_lib",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ ],
+ alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+ name = "spectrogram_op_test",
+ size = "small",
+ srcs = ["spectrogram_op_test.cc"],
+ deps = [
+ ":ops_util",
+ ":spectrogram_op",
+ "//tensorflow/cc:cc_ops",
+ "//tensorflow/cc:client_session",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:framework_internal",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:tensorflow",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ ],
+)
+
+cc_library(
+ name = "audio",
+ deps = [
+ ":decode_wav_op",
+ ":encode_wav_op",
+ ":spectrogram_op",
+ ],
+)
+
# Android libraries -----------------------------------------------------------
# Changes to the Android srcs here should be replicated in
diff --git a/tensorflow/core/kernels/spectrogram.cc b/tensorflow/core/kernels/spectrogram.cc
new file mode 100644
index 0000000000..7531d5d64a
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram.cc
@@ -0,0 +1,212 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/spectrogram.h"
+
+#include <math.h>
+
+#include "third_party/fft2d/fft.h"
+#include "tensorflow/core/lib/core/bits.h"
+
+namespace tensorflow {
+
+using std::complex;
+
+namespace {
+// Returns the default Hann window function for the spectrogram.
+void GetPeriodicHann(int window_length, std::vector<double>* window) {
+ // Some platforms don't have M_PI, so define a local constant here.
+ const double pi = std::atan(1) * 4;
+ window->resize(window_length);
+ for (int i = 0; i < window_length; ++i) {
+ (*window)[i] = 0.5 - 0.5 * cos((2 * pi * i) / window_length);
+ }
+}
+} // namespace
+
+bool Spectrogram::Initialize(int window_length, int step_length) {
+ std::vector<double> window;
+ GetPeriodicHann(window_length, &window);
+ return Initialize(window, step_length);
+}
+
+bool Spectrogram::Initialize(const std::vector<double>& window,
+ int step_length) {
+ window_length_ = window.size();
+ window_ = window; // Copy window.
+ if (window_length_ < 2) {
+ LOG(ERROR) << "Window length too short.";
+ initialized_ = false;
+ return false;
+ }
+
+ step_length_ = step_length;
+ if (step_length_ < 1) {
+ LOG(ERROR) << "Step length must be positive.";
+ initialized_ = false;
+ return false;
+ }
+
+ fft_length_ = NextPowerOfTwo(window_length_);
+ CHECK(fft_length_ >= window_length_);
+ output_frequency_channels_ = 1 + fft_length_ / 2;
+
+ // Allocate 2 more than what rdft needs, so we can rationalize the layout.
+ fft_input_output_.assign(fft_length_ + 2, 0.0);
+
+ int half_fft_length = fft_length_ / 2;
+ fft_double_working_area_.assign(half_fft_length, 0.0);
+ fft_integer_working_area_.assign(2 + static_cast<int>(sqrt(half_fft_length)),
+ 0);
+ // Set flag element to ensure that the working areas are initialized
+ // on the first call to rdft. It's redundant given the assign above,
+ // but keep it as a reminder.
+ fft_integer_working_area_[0] = 0;
+ input_queue_.clear();
+ samples_to_next_step_ = window_length_;
+ initialized_ = true;
+ return true;
+}
+
+template <class InputSample, class OutputSample>
+bool Spectrogram::ComputeComplexSpectrogram(
+ const std::vector<InputSample>& input,
+ std::vector<std::vector<complex<OutputSample>>>* output) {
+ if (!initialized_) {
+ LOG(ERROR) << "ComputeComplexSpectrogram() called before successful call "
+ << "to Initialize().";
+ return false;
+ }
+ CHECK(output);
+ output->clear();
+ int input_start = 0;
+ while (GetNextWindowOfSamples(input, &input_start)) {
+ DCHECK_EQ(input_queue_.size(), window_length_);
+ ProcessCoreFFT(); // Processes input_queue_ to fft_input_output_.
+ // Add a new slice vector onto the output, to save new result to.
+ output->resize(output->size() + 1);
+ // Get a reference to the newly added slice to fill in.
+ auto& spectrogram_slice = output->back();
+ spectrogram_slice.resize(output_frequency_channels_);
+ for (int i = 0; i < output_frequency_channels_; ++i) {
+ // This will convert double to float if it needs to.
+ spectrogram_slice[i] = complex<OutputSample>(
+ fft_input_output_[2 * i], fft_input_output_[2 * i + 1]);
+ }
+ }
+ return true;
+}
+// Instantiate it four ways:
+template bool Spectrogram::ComputeComplexSpectrogram(
+ const std::vector<float>& input, std::vector<std::vector<complex<float>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+ const std::vector<double>& input,
+ std::vector<std::vector<complex<float>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+ const std::vector<float>& input,
+ std::vector<std::vector<complex<double>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+ const std::vector<double>& input,
+ std::vector<std::vector<complex<double>>>*);
+
+template <class InputSample, class OutputSample>
+bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+ const std::vector<InputSample>& input,
+ std::vector<std::vector<OutputSample>>* output) {
+ if (!initialized_) {
+ LOG(ERROR) << "ComputeSquaredMagnitudeSpectrogram() called before "
+ << "successful call to Initialize().";
+ return false;
+ }
+ CHECK(output);
+ output->clear();
+ int input_start = 0;
+ while (GetNextWindowOfSamples(input, &input_start)) {
+ DCHECK_EQ(input_queue_.size(), window_length_);
+ ProcessCoreFFT(); // Processes input_queue_ to fft_input_output_.
+ // Add a new slice vector onto the output, to save new result to.
+ output->resize(output->size() + 1);
+ // Get a reference to the newly added slice to fill in.
+ auto& spectrogram_slice = output->back();
+ spectrogram_slice.resize(output_frequency_channels_);
+ for (int i = 0; i < output_frequency_channels_; ++i) {
+ // Similar to the Complex case, except storing the norm.
+ // But the norm function is known to be a performance killer,
+ // so do it this way with explicit real and imaginary temps.
+ const double re = fft_input_output_[2 * i];
+ const double im = fft_input_output_[2 * i + 1];
+ // Which finally converts double to float if it needs to.
+ spectrogram_slice[i] = re * re + im * im;
+ }
+ }
+ return true;
+}
+// Instantiate it four ways:
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+ const std::vector<float>& input, std::vector<std::vector<float>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+ const std::vector<double>& input, std::vector<std::vector<float>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+ const std::vector<float>& input, std::vector<std::vector<double>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+ const std::vector<double>& input, std::vector<std::vector<double>>*);
+
+// Return true if a full window of samples is prepared; manage the queue.
+template <class InputSample>
+bool Spectrogram::GetNextWindowOfSamples(const std::vector<InputSample>& input,
+ int* input_start) {
+ auto input_it = input.begin() + *input_start;
+ int input_remaining = input.end() - input_it;
+ if (samples_to_next_step_ > input_remaining) {
+ // Copy in as many samples are left and return false, no full window.
+ input_queue_.insert(input_queue_.end(), input_it, input.end());
+ *input_start += input_remaining; // Increases it to input.size().
+ samples_to_next_step_ -= input_remaining;
+ return false; // Not enough for a full window.
+ } else {
+ // Copy just enough into queue to make a new window, then trim the
+ // front off the queue to make it window-sized.
+ input_queue_.insert(input_queue_.end(), input_it,
+ input_it + samples_to_next_step_);
+ *input_start += samples_to_next_step_;
+ input_queue_.erase(
+ input_queue_.begin(),
+ input_queue_.begin() + input_queue_.size() - window_length_);
+ DCHECK_EQ(window_length_, input_queue_.size());
+ samples_to_next_step_ = step_length_; // Be ready for next time.
+ return true; // Yes, input_queue_ now contains exactly a window-full.
+ }
+}
+
+void Spectrogram::ProcessCoreFFT() {
+ for (int j = 0; j < window_length_; ++j) {
+ fft_input_output_[j] = input_queue_[j] * window_[j];
+ }
+ // Zero-pad the rest of the input buffer.
+ for (int j = window_length_; j < fft_length_; ++j) {
+ fft_input_output_[j] = 0.0;
+ }
+ const int kForwardFFT = 1; // 1 means forward; -1 reverse.
+ // This real FFT is a fair amount faster than using cdft here.
+ rdft(fft_length_, kForwardFFT, &fft_input_output_[0],
+ &fft_integer_working_area_[0], &fft_double_working_area_[0]);
+ // Make rdft result look like cdft result;
+ // unpack the last real value from the first position's imag slot.
+ fft_input_output_[fft_length_] = fft_input_output_[1];
+ fft_input_output_[fft_length_ + 1] = 0;
+ fft_input_output_[1] = 0;
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram.h b/tensorflow/core/kernels/spectrogram.h
new file mode 100644
index 0000000000..5476a0a961
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram.h
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Class for generating spectrogram slices from a waveform.
+// Initialize() should be called before calls to other functions. Once
+// Initialize() has been called and returned true, the Compute*() functions can
+// be called repeatedly with sequential input data (i.e. the first element of the
+// next input vector directly follows the last element of the previous input
+// vector). Whenever enough audio samples are buffered to produce a
+// new frame, it will be placed in output. Output is cleared on each
+// call to Compute*(). This class is thread-unsafe, and should only be
+// called from one thread at a time.
+// With the default parameters, the output of this class should be very
+// close to the results of the following MATLAB code:
+// overlap_samples = window_length_samples - step_samples;
+// window = hann(window_length_samples, 'periodic');
+// S = abs(spectrogram(audio, window, overlap_samples)).^2;
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+
+#include <complex>
+#include <deque>
+#include <vector>
+
+#include "third_party/fft2d/fft.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+class Spectrogram {
+ public:
+ Spectrogram() : initialized_(false) {}
+ ~Spectrogram() {}
+
+ // Initializes the class with a given window length and step length
+ // (both in samples). Internally a Hann window is used as the window
+ // function. Returns true on success, after which calls to Compute*()
+ // are possible. window_length must be greater than 1 and step
+ // length must be greater than 0.
+ bool Initialize(int window_length, int step_length);
+
+ // Initialize with an explicit window instead of a length.
+ bool Initialize(const std::vector<double>& window, int step_length);
+
+ // Processes an arbitrary amount of audio data (contained in input)
+ // to yield complex spectrogram frames. After a successful call to
+ // Initialize(), this may be called repeatedly with new input data
+ // each time. The audio input is buffered internally, and the output
+ // vector is populated with as many temporally-ordered spectral slices
+ // as it is possible to generate from the input. The output is cleared
+ // on each call before the new frames (if any) are added.
+ //
+ // The template parameters can be float or double.
+ template <class InputSample, class OutputSample>
+ bool ComputeComplexSpectrogram(
+ const std::vector<InputSample>& input,
+ std::vector<std::vector<std::complex<OutputSample>>>* output);
+
+ // This function works as the one above, but returns the power
+ // (the squared L2 norm, i.e. the squared magnitude) of each complex value.
+ template <class InputSample, class OutputSample>
+ bool ComputeSquaredMagnitudeSpectrogram(
+ const std::vector<InputSample>& input,
+ std::vector<std::vector<OutputSample>>* output);
+
+ // Return reference to the window function used internally.
+ const std::vector<double>& GetWindow() const { return window_; }
+
+ // Return the number of frequency channels in the spectrogram.
+ int output_frequency_channels() const { return output_frequency_channels_; }
+
+ private:
+ template <class InputSample>
+ bool GetNextWindowOfSamples(const std::vector<InputSample>& input,
+ int* input_start);
+ void ProcessCoreFFT();
+
+ int fft_length_;
+ int output_frequency_channels_;
+ int window_length_;
+ int step_length_;
+ bool initialized_;
+ int samples_to_next_step_;
+
+ std::vector<double> window_;
+ std::vector<double> fft_input_output_;
+ std::deque<double> input_queue_;
+
+ // Working data areas for the FFT routines.
+ std::vector<int> fft_integer_working_area_;
+ std::vector<double> fft_double_working_area_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(Spectrogram);
+};
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
diff --git a/tensorflow/core/kernels/spectrogram_convert_test_data.cc b/tensorflow/core/kernels/spectrogram_convert_test_data.cc
new file mode 100644
index 0000000000..bae13c0213
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_convert_test_data.cc
@@ -0,0 +1,56 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/spectrogram_test_utils.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace wav {
+
+// Reads a CSV file of complex numbers and saves a space-saving binary copy as
+// "<input_filename>.bin". Returns InvalidArgument if the write fails.
+Status ConvertCsvToRaw(const string& input_filename) {
+  std::vector<std::vector<std::complex<double>>> input_data;
+  ReadCSVFileToComplexVectorOrDie(input_filename, &input_data);
+  const string output_filename = input_filename + ".bin";
+  if (!WriteComplexVectorToRawFloatFile(output_filename, input_data)) {
+    return errors::InvalidArgument("Failed to write raw float file ",
+                                   output_filename);  // The file that failed.
+  }
+  LOG(INFO) << "Wrote raw file to " << output_filename;
+  return Status::OK();
+}
+
+} // namespace wav
+} // namespace tensorflow
+
+int main(int argc, char* argv[]) {
+ tensorflow::port::InitMain(argv[0], &argc, &argv);
+ if (argc < 2) {
+ LOG(ERROR) << "You must supply a CSV file as the first argument";
+ return 1;
+ }
+ tensorflow::string filename(argv[1]);
+ tensorflow::Status status = tensorflow::wav::ConvertCsvToRaw(filename);
+ if (!status.ok()) {
+ LOG(ERROR) << "Error processing '" << filename << "':" << status;
+ return 1;
+ }
+ return 0;
+}
diff --git a/tensorflow/core/kernels/spectrogram_op.cc b/tensorflow/core/kernels/spectrogram_op.cc
new file mode 100644
index 0000000000..98d9bb1ad1
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_op.cc
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/audio_ops.cc
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/spectrogram.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Create a spectrogram frequency visualization from audio data.
+class SpectrogramOp : public OpKernel {
+ public:
+  explicit SpectrogramOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("window_size", &window_size_));
+    OP_REQUIRES_OK(context, context->GetAttr("stride", &stride_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("magnitude_squared", &magnitude_squared_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, input.dims() == 2,
+                errors::InvalidArgument("input must be 2-dimensional: ",
+                                        input.shape().DebugString()));
+    Spectrogram spectrogram;
+    OP_REQUIRES(context, spectrogram.Initialize(window_size_, stride_),
+                errors::InvalidArgument(
+                    "Spectrogram initialization failed for window size ",
+                    window_size_, " and stride ", stride_));
+
+    const auto input_as_matrix = input.matrix<float>();
+
+    const int64 sample_count = input.dim_size(0);
+    const int64 channel_count = input.dim_size(1);
+
+    const int64 output_width = spectrogram.output_frequency_channels();
+    const int64 length_minus_window = (sample_count - window_size_);
+    const int64 output_height =
+        (length_minus_window < 0) ? 0 : 1 + (length_minus_window / stride_);
+    const int64 output_slices = channel_count;
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(
+        context,
+        context->allocate_output(
+            0, TensorShape({output_slices, output_height, output_width}),
+            &output_tensor));
+    auto output_flat = output_tensor->flat<float>().data();
+
+    std::vector<float> input_for_channel(sample_count);
+    for (int64 channel = 0; channel < channel_count; ++channel) {
+      // Initialize() empties the sample queue; without this, samples left over
+      // from the previous channel would be mixed into this channel's frames.
+      OP_REQUIRES(context, spectrogram.Initialize(window_size_, stride_),
+                  errors::InvalidArgument("Spectrogram reset failed"));
+      float* output_slice =
+          output_flat + (channel * output_height * output_width);
+      for (int64 i = 0; i < sample_count; ++i) {
+        input_for_channel[i] = input_as_matrix(i, channel);
+      }
+      std::vector<std::vector<float>> spectrogram_output;
+      OP_REQUIRES(context,
+                  spectrogram.ComputeSquaredMagnitudeSpectrogram(
+                      input_for_channel, &spectrogram_output),
+                  errors::InvalidArgument("Spectrogram compute failed"));
+      OP_REQUIRES(context, (spectrogram_output.size() == output_height),
+                  errors::InvalidArgument(
+                      "Spectrogram size calculation failed: Expected height ",
+                      output_height, " but got ", spectrogram_output.size()));
+      OP_REQUIRES(context,
+                  spectrogram_output.empty() ||
+                      (spectrogram_output[0].size() == output_width),
+                  errors::InvalidArgument(
+                      "Spectrogram size calculation failed: Expected width ",
+                      output_width, " but got ", spectrogram_output[0].size()));
+      for (int row_index = 0; row_index < output_height; ++row_index) {
+        const std::vector<float>& spectrogram_row =
+            spectrogram_output[row_index];
+        DCHECK_EQ(spectrogram_row.size(), output_width);
+        float* output_row = output_slice + (row_index * output_width);
+        if (magnitude_squared_) {
+          for (int i = 0; i < output_width; ++i) {
+            output_row[i] = spectrogram_row[i];
+          }
+        } else {
+          for (int i = 0; i < output_width; ++i) {
+            output_row[i] = sqrtf(spectrogram_row[i]);
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  int32 window_size_;
+  int32 stride_;
+  bool magnitude_squared_;
+};
+REGISTER_KERNEL_BUILDER(Name("AudioSpectrogram").Device(DEVICE_CPU),
+ SpectrogramOp);
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_op_test.cc b/tensorflow/core/kernels/spectrogram_op_test.cc
new file mode 100644
index 0000000000..5c3cbeeeb9
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_op_test.cc
@@ -0,0 +1,104 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+using namespace ops; // NOLINT(build/namespaces)
+
+// Runs AudioSpectrogram (window_size 8, stride 1) over an {8, 1} float tensor
+// and checks the default, non-squared magnitudes of the single output frame.
+TEST(SpectrogramOpTest, SimpleTest) {
+  Scope root = Scope::NewRootScope();
+
+  Tensor audio_tensor(DT_FLOAT, TensorShape({8, 1}));
+  test::FillValues<float>(&audio_tensor,
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  Output audio_const_op = Const(root.WithOpName("audio_const_op"),
+                                Input::Initializer(audio_tensor));
+
+  AudioSpectrogram spectrogram_op =
+      AudioSpectrogram(root.WithOpName("spectrogram_op"), audio_const_op, 8, 1);
+
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+
+  // Fatal assertions: if the run fails, `outputs` may be empty and indexing it
+  // below would be out of bounds.
+  TF_ASSERT_OK(session.Run(ClientSession::FeedType(),
+                           {spectrogram_op.spectrogram}, &outputs));
+  ASSERT_EQ(1, outputs.size());
+
+  const Tensor& spectrogram_tensor = outputs[0];
+
+  // The rank must be confirmed before asking for the size of dimension 2.
+  ASSERT_EQ(3, spectrogram_tensor.dims());
+  EXPECT_EQ(5, spectrogram_tensor.dim_size(2));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(1));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(0));
+
+  test::ExpectTensorNear<float>(
+      spectrogram_tensor,
+      test::AsTensor<float>({0, 1, 2, 1, 0}, TensorShape({1, 1, 5})), 1e-3);
+}
+
+// Same input as SimpleTest, but with the magnitude_squared attribute enabled,
+// so the expected values are the squares of the plain magnitudes.
+TEST(SpectrogramOpTest, SquaredTest) {
+  Scope root = Scope::NewRootScope();
+
+  Tensor audio_tensor(DT_FLOAT, TensorShape({8, 1}));
+  test::FillValues<float>(&audio_tensor,
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  Output audio_const_op = Const(root.WithOpName("audio_const_op"),
+                                Input::Initializer(audio_tensor));
+
+  AudioSpectrogram spectrogram_op =
+      AudioSpectrogram(root.WithOpName("spectrogram_op"), audio_const_op, 8, 1,
+                       AudioSpectrogram::Attrs().MagnitudeSquared(true));
+
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+
+  // Fatal assertions: if the run fails, `outputs` may be empty and indexing it
+  // below would be out of bounds.
+  TF_ASSERT_OK(session.Run(ClientSession::FeedType(),
+                           {spectrogram_op.spectrogram}, &outputs));
+  ASSERT_EQ(1, outputs.size());
+
+  const Tensor& spectrogram_tensor = outputs[0];
+
+  // The rank must be confirmed before asking for the size of dimension 2.
+  ASSERT_EQ(3, spectrogram_tensor.dims());
+  EXPECT_EQ(5, spectrogram_tensor.dim_size(2));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(1));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(0));
+
+  test::ExpectTensorNear<float>(
+      spectrogram_tensor,
+      test::AsTensor<float>({0, 1, 4, 1, 0}, TensorShape({1, 1, 5})), 1e-3);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_test.cc b/tensorflow/core/kernels/spectrogram_test.cc
new file mode 100644
index 0000000000..73175a91a0
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test.cc
@@ -0,0 +1,340 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The MATLAB test data were generated using GenerateTestData.m.
+
+#include "tensorflow/core/kernels/spectrogram.h"
+
+#include <complex>
+#include <vector>
+
+#include "tensorflow/core/kernels/spectrogram_test_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+using ::std::complex;
+
+// Test data locations. The tests below join these onto
+// testing::TensorFlowSrcRoot() before opening them.
+
+// WAV audio that every file-based test in this file reads as its input.
+const char kInputFilename[] =
+ "core/kernels/spectrogram_test_data/short_test_segment.wav";
+
+// Expected complex output compared against Initialize(512, 256) runs.
+const char kExpectedFilename[] =
+ "core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin";
+// Row length passed to ReadRawFloatFileToComplexVector for the file above.
+const int kDataVectorLength = 257;
+// Number of rows the tests expect to read back from the file above.
+const int kNumberOfFramesInTestData = 178;
+
+// Expected complex output compared against an Initialize(400, 200) run.
+const char kExpectedNonPowerOfTwoFilename[] =
+ "core/kernels/spectrogram_test_data/"
+ "short_test_segment_spectrogram_400_200.csv.bin";
+// Row length and expected row count for the non-power-of-two file above.
+const int kNonPowerOfTwoDataVectorLength = 257;
+const int kNumberOfFramesInNonPowerOfTwoTestData = 228;
+
+// Input shorter than one window must not produce any output frames.
+TEST(SpectrogramTest, TooLittleDataYieldsNoFrames) {
+  Spectrogram sgram;
+  // Stop immediately if setup fails; the frame count below would otherwise be
+  // checked against an uninitialized object.
+  ASSERT_TRUE(sgram.Initialize(400, 200));
+  std::vector<double> input;
+  // Generate 44 samples of audio.
+  SineWave(44100, 1000.0, 0.001, &input);
+  EXPECT_EQ(44, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  EXPECT_EQ(0, output.size());
+}
+
+// Overlapping windows: a 400-sample window advanced 200 samples at a time is
+// expected to yield two frames from 661 samples of input.
+TEST(SpectrogramTest, StepSizeSmallerThanWindow) {
+  Spectrogram spectrogram;
+  EXPECT_TRUE(spectrogram.Initialize(400, 200));
+
+  // 0.015 s of a 1 kHz tone sampled at 44.1 kHz gives 661 samples.
+  std::vector<double> samples;
+  SineWave(44100, 1000.0, 0.015, &samples);
+  EXPECT_EQ(661, samples.size());
+
+  std::vector<std::vector<complex<double>>> frames;
+  spectrogram.ComputeComplexSpectrogram(samples, &frames);
+  EXPECT_EQ(2, frames.size());
+}
+
+// Gapped windows: a 200-sample window advanced 400 samples at a time is
+// expected to yield two frames from 882 samples of input.
+TEST(SpectrogramTest, StepSizeBiggerThanWindow) {
+  Spectrogram spectrogram;
+  EXPECT_TRUE(spectrogram.Initialize(200, 400));
+
+  // 0.02 s of a 1 kHz tone sampled at 44.1 kHz gives 882 samples.
+  std::vector<double> samples;
+  SineWave(44100, 1000.0, 0.02, &samples);
+  EXPECT_EQ(882, samples.size());
+
+  std::vector<std::vector<complex<double>>> frames;
+  spectrogram.ComputeComplexSpectrogram(samples, &frames);
+  EXPECT_EQ(2, frames.size());
+}
+
+// Gapped windows again, but with an input that ends between 600 and 800
+// samples; two frames are still expected.
+TEST(SpectrogramTest, StepSizeBiggerThanWindow2) {
+  Spectrogram spectrogram;
+  EXPECT_TRUE(spectrogram.Initialize(200, 400));
+
+  // 0.016 s at 44.1 kHz: more than 600 but fewer than 800 samples.
+  std::vector<double> samples;
+  SineWave(44100, 1000.0, 0.016, &samples);
+  EXPECT_GT(samples.size(), 600);
+  EXPECT_LT(samples.size(), 800);
+
+  std::vector<std::vector<complex<double>>> frames;
+  spectrogram.ComputeComplexSpectrogram(samples, &frames);
+  EXPECT_EQ(2, frames.size());
+}
+
+TEST(SpectrogramTest,
+     MultipleCallsToComputeComplexSpectrogramMayYieldDifferentNumbersOfFrames) {
+  // Repeatedly pass inputs with "extra" samples beyond complete windows
+  // and check that the excess points accumulate to eventually cause an
+  // extra output frame.
+  Spectrogram sgram;
+  // Stop immediately if setup fails; the per-call frame counts below are
+  // meaningless otherwise.
+  ASSERT_TRUE(sgram.Initialize(200, 400));
+  std::vector<double> input;
+  // Generate 882 samples of audio.
+  SineWave(44100, 1000.0, 0.02, &input);
+  EXPECT_EQ(882, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  const std::vector<int> expected_output_sizes = {
+      2,  // One pass of input leaves 82 samples buffered after two steps of
+          // 400.
+      2,  // Passing in 882 samples again will now leave 164 samples buffered.
+      3,  // Third time gives 246 extra samples, triggering an extra output
+          // frame.
+  };
+  for (int expected_output_size : expected_output_sizes) {
+    sgram.ComputeComplexSpectrogram(input, &output);
+    EXPECT_EQ(expected_output_size, output.size());
+  }
+}
+
+TEST(SpectrogramTest, CumulatingExcessInputsForOverlappingFrames) {
+  // Input frames that don't fit into whole windows are accumulated even when
+  // the windows have overlap (similar to
+  // MultipleCallsToComputeComplexSpectrogramMayYieldDifferentNumbersOfFrames
+  // but with window size/hop size swapped).
+  Spectrogram sgram;
+  // Stop immediately if setup fails; the per-call frame counts below are
+  // meaningless otherwise.
+  ASSERT_TRUE(sgram.Initialize(400, 200));
+  std::vector<double> input;
+  // Generate 882 samples of audio.
+  SineWave(44100, 1000.0, 0.02, &input);
+  EXPECT_EQ(882, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  const std::vector<int> expected_output_sizes = {
+      3,  // Windows 0..400, 200..600, 400..800 with 82 samples buffered.
+      4,  // 1764 frames input; outputs from 600, 800, 1000, 1200..1600.
+      5,  // 2646 frames in; outputs from 1400, 1600, 1800, 2000, 2200..2600.
+  };
+  for (int expected_output_size : expected_output_sizes) {
+    sgram.ComputeComplexSpectrogram(input, &output);
+    EXPECT_EQ(expected_output_size, output.size());
+  }
+}
+
+// Back-to-back windows (step == window): 2205 samples split into 200-sample
+// windows are expected to give 11 frames.
+TEST(SpectrogramTest, StepSizeEqualToWindowWorks) {
+  Spectrogram sgram;
+  // Stop immediately if setup fails; the frame count below is meaningless
+  // otherwise.
+  ASSERT_TRUE(sgram.Initialize(200, 200));
+  std::vector<double> input;
+  // Generate 2205 samples of audio.
+  SineWave(44100, 1000.0, 0.05, &input);
+  EXPECT_EQ(2205, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  EXPECT_EQ(11, output.size());
+}
+
+// Asserts that two complex spectrograms have the same dimensions and that the
+// real and imaginary parts of every element agree to within `tolerance`.
+// Stops at the first mismatch and reports its (i, j) position.
+template <class ExpectedSample, class ActualSample>
+void CompareComplexData(
+    const std::vector<std::vector<complex<ExpectedSample>>>& expected,
+    const std::vector<std::vector<complex<ActualSample>>>& actual,
+    double tolerance) {
+  ASSERT_EQ(actual.size(), expected.size());
+  const int row_count = expected.size();
+  for (int i = 0; i < row_count; ++i) {
+    const std::vector<complex<ExpectedSample>>& expected_row = expected[i];
+    const std::vector<complex<ActualSample>>& actual_row = actual[i];
+    ASSERT_EQ(expected_row.size(), actual_row.size());
+    const int column_count = expected_row.size();
+    for (int j = 0; j < column_count; ++j) {
+      ASSERT_NEAR(expected_row[j].real(), actual_row[j].real(), tolerance)
+          << ": where i=" << i << " and j=" << j << ".";
+      ASSERT_NEAR(expected_row[j].imag(), actual_row[j].imag(), tolerance)
+          << ": where i=" << i << " and j=" << j << ".";
+    }
+  }
+}
+
+// Returns the largest std::abs() value found in a 2-D array of samples, or
+// 0.0 when the array holds no elements.
+template <class Sample>
+double GetMaximumAbsolute(const std::vector<std::vector<Sample>>& spectrogram) {
+  double largest = 0.0;
+  for (const std::vector<Sample>& row : spectrogram) {
+    for (const Sample& sample : row) {
+      const double magnitude = std::abs(sample);
+      if (magnitude > largest) {
+        largest = magnitude;
+      }
+    }
+  }
+  return largest;
+}
+
+// Asserts that each value in `actual_squared_magnitude` is within `tolerance`
+// of norm() of the matching element in `expected_complex_output`, and that
+// both arrays have the same dimensions. Stops at the first mismatch and
+// reports its (i, j) position.
+template <class ExpectedSample, class ActualSample>
+void CompareMagnitudeData(
+    const std::vector<std::vector<complex<ExpectedSample>>>&
+        expected_complex_output,
+    const std::vector<std::vector<ActualSample>>& actual_squared_magnitude,
+    double tolerance) {
+  ASSERT_EQ(actual_squared_magnitude.size(), expected_complex_output.size());
+  const int row_count = expected_complex_output.size();
+  for (int i = 0; i < row_count; ++i) {
+    const std::vector<complex<ExpectedSample>>& expected_row =
+        expected_complex_output[i];
+    const std::vector<ActualSample>& actual_row = actual_squared_magnitude[i];
+    ASSERT_EQ(expected_row.size(), actual_row.size());
+    const int column_count = expected_row.size();
+    for (int j = 0; j < column_count; ++j) {
+      ASSERT_NEAR(norm(expected_row[j]), actual_row[j], tolerance)
+          << ": where i=" << i << " and j=" << j << ".";
+    }
+  }
+}
+
+// Calling Initialize() again on a used Spectrogram must give output identical
+// to the first run over the same input.
+TEST(SpectrogramTest, ReInitializationWorks) {
+  Spectrogram sgram;
+  ASSERT_TRUE(sgram.Initialize(512, 256));
+  std::vector<double> input;
+  ASSERT_TRUE(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  std::vector<std::vector<complex<double>>> first_output;
+  std::vector<std::vector<complex<double>>> second_output;
+  ASSERT_TRUE(sgram.Initialize(512, 256));
+  sgram.ComputeComplexSpectrogram(input, &first_output);
+  // Re-Initialize it.
+  ASSERT_TRUE(sgram.Initialize(512, 256));
+  sgram.ComputeComplexSpectrogram(input, &second_output);
+  // Verify identical outputs.
+  ASSERT_EQ(first_output.size(), second_output.size());
+  // Guard the first_output[0] access below; an empty result would also make
+  // the comparison loop vacuous.
+  ASSERT_FALSE(first_output.empty());
+  int slice_size = first_output[0].size();
+  for (int i = 0; i < first_output.size(); ++i) {
+    ASSERT_EQ(slice_size, first_output[i].size());
+    ASSERT_EQ(slice_size, second_output[i].size());
+    for (int j = 0; j < slice_size; ++j) {
+      ASSERT_EQ(first_output[i][j], second_output[i][j]);
+    }
+  }
+}
+
+// Compares the double-precision complex spectrogram (window 512, step 256) of
+// the test WAV file against the stored reference data.
+TEST(SpectrogramTest, ComputedComplexDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  ASSERT_TRUE(sgram.Initialize(512, 256));
+  std::vector<double> input;
+  ASSERT_TRUE(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  // Fatal check: expected_output[0] is read on the next line.
+  ASSERT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  CompareComplexData(expected_output, output, 1e-5);
+}
+
+// Same comparison as ComputedComplexDataAgreeWithMatlab, but with float input
+// and output, so a looser tolerance is used.
+TEST(SpectrogramTest, ComputedFloatComplexDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  ASSERT_TRUE(sgram.Initialize(512, 256));
+  std::vector<double> double_input;
+  ASSERT_TRUE(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &double_input));
+  // Narrow the decoded samples to float for the float code path.
+  std::vector<float> input;
+  input.assign(double_input.begin(), double_input.end());
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  // Fatal check: expected_output[0] is read on the next line.
+  ASSERT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<complex<float>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  CompareComplexData(expected_output, output, 1e-4);
+}
+
+// Compares the double-precision squared-magnitude spectrogram (window 512,
+// step 256) of the test WAV file against norm() of the stored reference data.
+TEST(SpectrogramTest, ComputedSquaredMagnitudeDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  ASSERT_TRUE(sgram.Initialize(512, 256));
+  std::vector<double> input;
+  ASSERT_TRUE(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  // Fatal check: expected_output[0] is read on the next line.
+  ASSERT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<double>> output;
+  // The computation reports failure through its return value; surface it
+  // here rather than as a confusing size mismatch in the comparison.
+  ASSERT_TRUE(sgram.ComputeSquaredMagnitudeSpectrogram(input, &output));
+  CompareMagnitudeData(expected_output, output, 1e-3);
+}
+
+// Same comparison as ComputedSquaredMagnitudeDataAgreeWithMatlab, but with
+// float input and output.
+TEST(SpectrogramTest, ComputedFloatSquaredMagnitudeDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  ASSERT_TRUE(sgram.Initialize(512, 256));
+  std::vector<double> double_input;
+  ASSERT_TRUE(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &double_input));
+  EXPECT_EQ(kInputDataLength, double_input.size());
+  // Narrow the decoded samples to float for the float code path.
+  std::vector<float> input;
+  input.assign(double_input.begin(), double_input.end());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  // Fatal check: expected_output[0] is read on the next line.
+  ASSERT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<float>> output;
+  // The computation reports failure through its return value; surface it
+  // here rather than as a confusing size mismatch in the comparison.
+  ASSERT_TRUE(sgram.ComputeSquaredMagnitudeSpectrogram(input, &output));
+  double max_absolute = GetMaximumAbsolute(output);
+  EXPECT_GT(max_absolute, 2300.0);  // Verify that we have some big numbers.
+  // Squaring increases dynamic range; max square is about 2300,
+  // so 2e-4 is about 7 decimal digits; not bad for a float.
+  CompareMagnitudeData(expected_output, output, 2e-4);
+}
+
+// Compares the complex spectrogram for a window length that is not a power of
+// two (window 400, step 200) against the stored reference data.
+TEST(SpectrogramTest, ComputedNonPowerOfTwoComplexDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  ASSERT_TRUE(sgram.Initialize(400, 200));
+  std::vector<double> input;
+  ASSERT_TRUE(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(),
+                               kExpectedNonPowerOfTwoFilename),
+      kNonPowerOfTwoDataVectorLength, &expected_output));
+  // Fatal check: expected_output[0] is read on the next line.
+  ASSERT_EQ(kNumberOfFramesInNonPowerOfTwoTestData, expected_output.size());
+  EXPECT_EQ(kNonPowerOfTwoDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  CompareComplexData(expected_output, output, 1e-5);
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_test_data/README b/tensorflow/core/kernels/spectrogram_test_data/README
new file mode 100644
index 0000000000..271238e0c9
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_data/README
@@ -0,0 +1,8 @@
+The CSV spectrogram files in this directory are generated from the
+MATLAB code in ./matlab/GenerateTestData.m.
+To save space in the repo, you'll then need to convert them into a packed binary
+format (*.csv.bin) using the spectrogram_convert_test_data.cc command line tool.
+
+
+short_test_segment.wav is approximately 1s of music audio.
+
diff --git a/tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav
new file mode 100644
index 0000000000..7339dfd08c
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav
Binary files differ
diff --git a/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin
new file mode 100644
index 0000000000..67b9e2487c
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin
Binary files differ
diff --git a/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin
new file mode 100644
index 0000000000..d5e4cc5dd6
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin
Binary files differ
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
new file mode 100644
index 0000000000..a2141c649f
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -0,0 +1,288 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/spectrogram_test_utils.h"
+
+#include <math.h>
+#include <stddef.h>
+
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/wav/wav_io.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// Loads `file_name`, decodes it with wav::DecodeLin16WaveAsFloatVector, and
+// stores the decoded samples in `*data` widened to double. Returns false if
+// the file cannot be read or the decoder reports an error; the sample count,
+// channel count and sample rate reported by the decoder are discarded.
+bool ReadWaveFileToVector(const string& file_name, std::vector<double>* data) {
+  string file_contents;
+  if (!ReadFileToString(Env::Default(), file_name, &file_contents).ok()) {
+    LOG(ERROR) << "Wave file read failed for " << file_name;
+    return false;
+  }
+
+  std::vector<float> samples;
+  uint32 sample_count;
+  uint16 channel_count;
+  uint32 sample_rate;
+  const bool decoded_ok =
+      wav::DecodeLin16WaveAsFloatVector(file_contents, &samples, &sample_count,
+                                        &channel_count, &sample_rate)
+          .ok();
+  if (!decoded_ok) {
+    return false;
+  }
+
+  // Element-wise float -> double conversion into the caller's vector.
+  data->assign(samples.begin(), samples.end());
+  return true;
+}
+
+// Reads a file of raw 32-bit floats laid out as
+// [real_1, imag_1, real_2, imag_2, ...] and groups the resulting complex
+// values into rows of `row_length` elements in `*data`.
+//
+// Returns false if the file cannot be read or `row_length` is not positive.
+// A trailing partial row is dropped. Trailing bytes that do not form a
+// complete (real, imag) pair are ignored with a warning instead of being read
+// past the end of the buffer.
+bool ReadRawFloatFileToComplexVector(
+    const string& file_name, int row_length,
+    std::vector<std::vector<std::complex<double> > >* data) {
+  data->clear();
+  if (row_length <= 0) {
+    LOG(ERROR) << "Row length must be positive, but got " << row_length;
+    return false;
+  }
+  string data_string;
+  if (!ReadFileToString(Env::Default(), file_name, &data_string).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  float real_out;
+  float imag_out;
+  const int kBytesPerValue = 4;
+  CHECK_EQ(sizeof(real_out), kBytesPerValue);
+  const int kBytesPerComplex = 2 * kBytesPerValue;
+  std::vector<std::complex<double> > data_row;
+  int offset = 0;
+  const int end = data_string.size();
+  // Only consume whole (real, imag) pairs so memcpy never reads beyond the
+  // end of the file contents.
+  while (offset + kBytesPerComplex <= end) {
+    memcpy(&real_out, data_string.data() + offset, kBytesPerValue);
+    offset += kBytesPerValue;
+    memcpy(&imag_out, data_string.data() + offset, kBytesPerValue);
+    offset += kBytesPerValue;
+    data_row.push_back(std::complex<double>(real_out, imag_out));
+    if (static_cast<int>(data_row.size()) >= row_length) {
+      data->push_back(data_row);
+      data_row.clear();
+    }
+  }
+  if (offset != end) {
+    LOG(WARNING) << "Ignoring " << (end - offset) << " trailing bytes in "
+                 << file_name;
+  }
+  return true;
+}
+
+// Parses a CSV file of real/complex numbers into `*data`, one inner vector
+// per non-empty line. Logs a fatal error if the file cannot be read.
+//
+// Each comma-separated element may be in one of these forms:
+//   0.001+0.002i, 0.001, 0.001i, -1.2i, -1.2-3.2i, 1.5, 1.5e-03+21.0i, 1+2i
+// A part containing 'i' or 'j' is treated as the imaginary part; a missing
+// part is taken as 0.0.
+void ReadCSVFileToComplexVectorOrDie(
+    const string& file_name,
+    std::vector<std::vector<std::complex<double> > >* data) {
+  data->clear();
+  string data_string;
+  if (!ReadFileToString(Env::Default(), file_name, &data_string).ok()) {
+    LOG(FATAL) << "Failed to open file " << file_name;
+    return;
+  }
+  std::vector<string> lines = str_util::Split(data_string, '\n');
+  for (const string& line : lines) {
+    if (line.empty()) {
+      continue;
+    }
+    std::vector<std::complex<double> > data_line;
+    std::vector<string> values = str_util::Split(line, ',');
+    for (const string& value : values) {
+      // Find the '+' or '-' separating the real and imaginary parts. The
+      // search starts at index 1 so that a leading sign is skipped but a
+      // one-character real part ("1+2i") is still split. A sign that
+      // directly follows an exponent marker belongs to the exponent, so the
+      // search moves on to the next candidate.
+      size_t operator_index = value.find_first_of("+-", 1);
+      if (operator_index != string::npos &&
+          (value[operator_index - 1] == 'e' ||
+           value[operator_index - 1] == 'E')) {
+        operator_index = value.find_first_of("+-", operator_index + 1);
+      }
+      std::vector<string> parts;
+      parts.push_back(value.substr(0, operator_index));
+      if (operator_index != string::npos) {
+        parts.push_back(value.substr(operator_index));
+      }
+
+      double real_part = 0.0;
+      double imaginary_part = 0.0;
+      for (const string& part : parts) {
+        // The conversion result is deliberately not checked, as in the
+        // original code. NOTE(review): this relies on safe_strtod still
+        // storing the numeric prefix when the text ends in 'i'/'j' - confirm.
+        if (part.find_first_of("ij") != string::npos) {
+          strings::safe_strtod(part.c_str(), &imaginary_part);
+        } else {
+          strings::safe_strtod(part.c_str(), &real_part);
+        }
+      }
+      data_line.push_back(std::complex<double>(real_part, imaginary_part));
+    }
+    data->push_back(data_line);
+  }
+}
+
+// Reads a text file of comma-separated floats into `*array`, one inner vector
+// per non-empty line. CHECK-fails if the file cannot be read or a line cannot
+// be parsed as floats.
+void ReadCSVFileToArrayOrDie(const string& filename,
+                             std::vector<std::vector<float> >* array) {
+  string contents;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), filename, &contents));
+  std::vector<string> lines = str_util::Split(contents, '\n');
+  contents.clear();
+
+  array->clear();
+  std::vector<float> values;
+  for (const string& line : lines) {
+    // Skip blank lines (for example the one produced by a trailing newline)
+    // so they do not become spurious empty rows; this matches
+    // ReadCSVFileToComplexVectorOrDie.
+    if (line.empty()) {
+      continue;
+    }
+    values.clear();
+    CHECK(str_util::SplitAndParseAsFloats(line, ',', &values));
+    array->push_back(values);
+  }
+}
+
+// Writes the elements of `data` to `file_name` as raw doubles, in order.
+// Returns false (after logging) if the file cannot be opened, appended to, or
+// closed. Delegates to the pointer-based writer, which performs the same
+// open / append-per-element / close sequence.
+bool WriteDoubleVectorToFile(const string& file_name,
+                             const std::vector<double>& data) {
+  const int element_count = data.size();
+  return WriteDoubleArrayToFile(file_name, element_count, data.data());
+}
+
+// Writes the elements of `data` to `file_name` as raw floats, in order.
+// Returns false (after logging) if the file cannot be opened, appended to, or
+// closed. Delegates to the pointer-based writer, which performs the same
+// open / append-per-element / close sequence.
+bool WriteFloatVectorToFile(const string& file_name,
+                            const std::vector<float>& data) {
+  const int element_count = data.size();
+  return WriteFloatArrayToFile(file_name, element_count, data.data());
+}
+
+// Writes `size` doubles starting at `data` to `file_name` as raw bytes.
+// Returns false (after logging) if the file cannot be opened, appended to, or
+// closed. A non-positive `size` produces an empty file.
+bool WriteDoubleArrayToFile(const string& file_name, int size,
+                            const double* data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  if (size > 0) {
+    // The elements are contiguous, so a single append writes exactly the same
+    // bytes as appending them one at a time.
+    const StringPiece bytes(reinterpret_cast<const char*>(data),
+                            static_cast<size_t>(size) * sizeof(data[0]));
+    if (!file->Append(bytes).ok()) {
+      LOG(ERROR) << "Failed to append to file " << file_name;
+      return false;
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+// Writes `size` floats starting at `data` to `file_name` as raw bytes.
+// Returns false (after logging) if the file cannot be opened, appended to, or
+// closed. A non-positive `size` produces an empty file.
+bool WriteFloatArrayToFile(const string& file_name, int size,
+                           const float* data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  if (size > 0) {
+    // The elements are contiguous, so a single append writes exactly the same
+    // bytes as appending them one at a time.
+    const StringPiece bytes(reinterpret_cast<const char*>(data),
+                            static_cast<size_t>(size) * sizeof(data[0]));
+    if (!file->Append(bytes).ok()) {
+      LOG(ERROR) << "Failed to append to file " << file_name;
+      return false;
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+// Writes every complex value in `data`, row by row, to `file_name` as two raw
+// floats: the real part followed by the imaginary part. Returns false (after
+// logging) if the file cannot be opened, appended to, or closed.
+bool WriteComplexVectorToRawFloatFile(
+    const string& file_name,
+    const std::vector<std::vector<std::complex<double> > >& data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  for (const std::vector<std::complex<double> >& row : data) {
+    for (const std::complex<double>& value : row) {
+      // Narrow each component to float and append them in (real, imag) order.
+      const float components[2] = {static_cast<float>(value.real()),
+                                   static_cast<float>(value.imag())};
+      for (const float& component : components) {
+        const StringPiece bytes(reinterpret_cast<const char*>(&component),
+                                sizeof(component));
+        if (!file->Append(bytes).ok()) {
+          LOG(ERROR) << "Failed to append to file " << file_name;
+          return false;
+        }
+      }
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+// Replaces the contents of `data` with a sine wave of the given frequency.
+// The number of samples is sample_rate * duration_seconds truncated to an
+// int; sample i has the value sin(2 * pi * i * frequency / sample_rate).
+void SineWave(int sample_rate, float frequency, float duration_seconds,
+              std::vector<double>* data) {
+  data->clear();
+  // The sample count does not change between iterations, so compute it once.
+  const int sample_count = static_cast<int>(sample_rate * duration_seconds);
+  int i = 0;
+  while (i < sample_count) {
+    const double sample =
+        sin(2.0 * M_PI * i * frequency / static_cast<double>(sample_rate));
+    data->push_back(sample);
+    ++i;
+  }
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.h b/tensorflow/core/kernels/spectrogram_test_utils.h
new file mode 100644
index 0000000000..59a903549e
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_utils.h
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+
+#include <complex>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+// Reads a wav format file into a vector of floating-point values with range
+// -1.0 to 1.0.
+bool ReadWaveFileToVector(const string& file_name, std::vector<double>* data);
+
+// Reads a binary file containing 32-bit floating point values in the
+// form [real_1, imag_1, real_2, imag_2, ...] into a rectangular array
+// of complex values where row_length is the length of each inner vector.
+bool ReadRawFloatFileToComplexVector(
+ const string& file_name, int row_length,
+ std::vector<std::vector<std::complex<double> > >* data);
+
+// Reads a CSV file of numbers in the format 1.1+2.2i,1.1,2.2i,3.3j into data.
+void ReadCSVFileToComplexVectorOrDie(
+ const string& file_name,
+ std::vector<std::vector<std::complex<double> > >* data);
+
+// Reads a 2D array of floats from an ASCII text file, where each line is a row
+// of the array, and elements are separated by commas.
+void ReadCSVFileToArrayOrDie(const string& filename,
+ std::vector<std::vector<float> >* array);
+
+// Write a binary file containing 64-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteDoubleVectorToFile(const string& file_name,
+ const std::vector<double>& data);
+
+// Write a binary file containing 32-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteFloatVectorToFile(const string& file_name,
+ const std::vector<float>& data);
+
+// Write a binary file containing 64-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteDoubleArrayToFile(const string& file_name, int size,
+ const double* data);
+
+// Write a binary file containing 32-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteFloatArrayToFile(const string& file_name, int size,
+ const float* data);
+
+// Write a binary file in the format read by
+// ReadRawFloatFileToComplexVector above.
+bool WriteComplexVectorToRawFloatFile(
+ const string& file_name,
+ const std::vector<std::vector<std::complex<double> > >& data);
+
+// Generate a sine wave with the provided parameters, and populate
+// data with the samples.
+void SineWave(int sample_rate, float frequency, float duration_seconds,
+ std::vector<double>* data);
+
+} // namespace tensorflow
+
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
diff --git a/tensorflow/core/lib/core/bits.h b/tensorflow/core/lib/core/bits.h
index 30ad0c2bea..1110ef5c2a 100644
--- a/tensorflow/core/lib/core/bits.h
+++ b/tensorflow/core/lib/core/bits.h
@@ -16,6 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_LIB_CORE_BITS_H_
#define TENSORFLOW_LIB_CORE_BITS_H_
+#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
@@ -91,6 +92,18 @@ inline int Log2Ceiling64(uint64 n) {
return floor + 1;
}
+// Returns 2 raised to Log2Ceiling(value), i.e. the smallest power of two that
+// is not less than `value`. In debug builds, checks that the exponent is a
+// valid shift amount for a 32-bit result.
+inline uint32 NextPowerOfTwo(uint32 value) {
+  int exponent = Log2Ceiling(value);
+  // NOTE(review): Log2Ceiling(0) is presumably negative - confirm; a negative
+  // shift amount would be undefined, so it is rejected in debug builds.
+  DCHECK_GE(exponent, 0);
+  DCHECK_LT(exponent, std::numeric_limits<uint32>::digits);
+  // Shift an unsigned one: `1 << 31` on a signed int is undefined behaviour.
+  return static_cast<uint32>(1) << exponent;
+}
+
+// 64-bit counterpart of NextPowerOfTwo: returns 2 raised to
+// Log2Ceiling64(value). In debug builds, checks that the exponent is a valid
+// shift amount for a 64-bit result.
+inline uint64 NextPowerOfTwo64(uint64 value) {
+  // Use the 64-bit helper; Log2Ceiling takes a 32-bit argument and would
+  // silently drop the upper half of `value`.
+  int exponent = Log2Ceiling64(value);
+  // NOTE(review): Log2Ceiling64(0) is presumably negative - confirm; a
+  // negative shift amount would be undefined, so it is rejected in debug
+  // builds.
+  DCHECK_GE(exponent, 0);
+  DCHECK_LT(exponent, std::numeric_limits<uint64>::digits);
+  // Shift an unsigned one: `1LL << 63` on a signed type is undefined
+  // behaviour.
+  return static_cast<uint64>(1) << exponent;
+}
+
} // namespace tensorflow
#endif // TENSORFLOW_LIB_CORE_BITS_H_
diff --git a/tensorflow/core/ops/audio_ops.cc b/tensorflow/core/ops/audio_ops.cc
index d6dedc3820..2f55e45e37 100644
--- a/tensorflow/core/ops/audio_ops.cc
+++ b/tensorflow/core/ops/audio_ops.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/bits.h"
namespace tensorflow {
@@ -66,6 +67,39 @@ Status EncodeWavShapeFn(InferenceContext* c) {
return Status::OK();
}
+// Shape function for the AudioSpectrogram op.
+//
+// Requires a rank-2 input and produces a rank-3 output shaped
+// {input dim 0, number of windows, 1 + NextPowerOfTwo(window_size) / 2}.
+// The number of windows is unknown when input dim 1 is unknown, 0 when the
+// input is shorter than one window, and otherwise
+// 1 + (length - window_size) / stride.
+// Returns InvalidArgument when window_size or stride is not positive.
+Status SpectrogramShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+  int32 window_size;
+  TF_RETURN_IF_ERROR(c->GetAttr("window_size", &window_size));
+  int32 stride;
+  TF_RETURN_IF_ERROR(c->GetAttr("stride", &stride));
+
+  // Validate the attrs up front: stride is used as a divisor below, and
+  // window_size is converted to an unsigned value for NextPowerOfTwo().
+  if (window_size < 1) {
+    return errors::InvalidArgument("window_size must be positive, got ",
+                                   window_size);
+  }
+  if (stride < 1) {
+    return errors::InvalidArgument("stride must be positive, got ", stride);
+  }
+
+  DimensionHandle input_channels = c->Dim(input, 0);
+  DimensionHandle input_length = c->Dim(input, 1);
+
+  DimensionHandle output_length;
+  if (!c->ValueKnown(input_length)) {
+    output_length = c->UnknownDim();
+  } else {
+    const int64 input_length_value = c->Value(input_length);
+    const int64 length_minus_window = (input_length_value - window_size);
+    int64 output_length_value;
+    if (length_minus_window < 0) {
+      // Not enough samples for even a single window.
+      output_length_value = 0;
+    } else {
+      output_length_value = 1 + (length_minus_window / stride);
+    }
+    output_length = c->MakeDim(output_length_value);
+  }
+
+  DimensionHandle output_channels =
+      c->MakeDim(1 + NextPowerOfTwo(window_size) / 2);
+  c->set_output(0,
+                c->MakeShape({input_channels, output_length, output_channels}));
+  return Status::OK();
+}
+
} // namespace
REGISTER_OP("DecodeWav")
@@ -121,4 +155,49 @@ sample_rate: Scalar containing the sample frequency.
contents: 0-D. WAV-encoded file contents.
)doc");
+REGISTER_OP("AudioSpectrogram")
+ .Input("input: float")
+ .Attr("window_size: int")
+ .Attr("stride: int")
+ .Attr("magnitude_squared: bool = false")
+ .Output("spectrogram: float")
+ .SetShapeFn(SpectrogramShapeFn)
+ .Doc(R"doc(
+Produces a visualization of audio data over time.
+
+Spectrograms are a standard way of representing audio information as a series of
+slices of frequency information, one slice for each window of time. By joining
+these together into a sequence, they form a distinctive fingerprint of the sound
+over time.
+
+This op expects to receive audio data as an input, stored as floats in the range
+-1 to 1, together with a window width in samples, and a stride specifying how
+far to move the window between slices. From this it generates a three
+dimensional output. The lowest dimension has an amplitude value for each
+frequency during that time slice. The next dimension is time, with successive
+frequency slices. The final dimension is for the channels in the input, so a
+stereo audio input would have two here for example.
+
+This means the layout when converted and saved as an image is rotated 90 degrees
+clockwise from a typical spectrogram. Time is descending down the Y axis, and
+the frequency decreases from left to right.
+
+Each value in the result represents the square root of the sum of the squares
+of the real and imaginary parts of an FFT on the current window of samples (or
+that sum itself when magnitude_squared is true). In this way, the lowest
+dimension represents the power of each frequency in the current window, and
+adjacent windows are concatenated in the next dimension.
+
+To get a more intuitive and visual look at what this operation does, you can run
+tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+resulting spectrogram as a PNG image.
+
+input: Float representation of audio data.
+window_size: How wide the input window is in samples. For the highest efficiency
+ this should be a power of two, but other values are accepted.
+stride: How widely apart the center of adjacent sample windows should be.
+magnitude_squared: Whether to return the squared magnitude or just the
+ magnitude. Using squared magnitude can avoid extra calculations.
+spectrogram: 3D representation of the audio frequencies as an image.
+)doc");
+
} // namespace tensorflow
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 0857010f7c..62af852173 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -92,6 +92,7 @@ cc_library(
"//tensorflow/core:protos_cc",
"@com_googlesource_code_re2//:re2",
"@farmhash_archive//:farmhash",
+ "@fft2d//:fft2d",
"@highwayhash//:sip_hash",
"@png_archive//:png",
],
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 03eb076f30..8373eb1f9e 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -93,6 +93,22 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
return false;
}
+// Tries to interpret arg as "--<flag>=<float>".
+//
+// Returns true when arg names this flag (whether or not the value parsed),
+// and false when arg is for some other flag. *value_parsing_ok is set to
+// false only when the flag matched but its value is not a float with nothing
+// after it. *dst is written only when the value parsed successfully.
+bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
+                    float* dst, bool* value_parsing_ok) {
+  *value_parsing_ok = true;
+  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+    // Parse into a temporary so that input such as "1.5abc", where sscanf
+    // converts the number before reaching the trailing junk, does not
+    // overwrite *dst while reporting a failure.
+    float parsed_value;
+    // The extra %c detects trailing characters: exactly one conversion means
+    // the whole value was consumed by %f.
+    char extra;
+    if (sscanf(arg.data(), "%f%c", &parsed_value, &extra) != 1) {
+      LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
+                 << ".";
+      *value_parsing_ok = false;
+    } else {
+      *dst = parsed_value;
+    }
+    return true;
+  }
+
+  return false;
+}
+
} // namespace
Flag::Flag(const char* name, tensorflow::int32* dst, const string& usage_text)
@@ -116,6 +132,12 @@ Flag::Flag(const char* name, string* dst, const string& usage_text)
string_value_(dst),
usage_text_(usage_text) {}
+// Creates a float-valued flag called `name` whose value is read from and
+// written to *dst. Only float_value_ is set here; the value pointers for the
+// other flag types are not initialized by this constructor.
+Flag::Flag(const char* name, float* dst, const string& usage_text)
+ : name_(name),
+ type_(TYPE_FLOAT),
+ float_value_(dst),
+ usage_text_(usage_text) {}
+
bool Flag::Parse(string arg, bool* value_parsing_ok) const {
bool result = false;
if (type_ == TYPE_INT) {
@@ -126,6 +148,8 @@ bool Flag::Parse(string arg, bool* value_parsing_ok) const {
result = ParseBoolFlag(arg, name_, bool_value_, value_parsing_ok);
} else if (type_ == TYPE_STRING) {
result = ParseStringFlag(arg, name_, string_value_, value_parsing_ok);
+ } else if (type_ == TYPE_FLOAT) {
+ result = ParseFloatFlag(arg, name_, float_value_, value_parsing_ok);
}
return result;
}
@@ -195,6 +219,10 @@ bool Flag::Parse(string arg, bool* value_parsing_ok) const {
type_name = "string";
flag_string = strings::Printf("--%s=\"%s\"", flag.name_.c_str(),
flag.string_value_->c_str());
+ } else if (flag.type_ == Flag::TYPE_FLOAT) {
+ type_name = "float";
+ flag_string =
+ strings::Printf("--%s=%f", flag.name_.c_str(), *flag.float_value_);
}
strings::Appendf(&usage_text, "\t%-33s\t%s\t%s\n", flag_string.c_str(),
type_name, flag.usage_text_.c_str());
diff --git a/tensorflow/core/util/command_line_flags.h b/tensorflow/core/util/command_line_flags.h
index 2c77d7874f..f349df16fd 100644
--- a/tensorflow/core/util/command_line_flags.h
+++ b/tensorflow/core/util/command_line_flags.h
@@ -65,6 +65,7 @@ class Flag {
Flag(const char* name, int64* dst1, const string& usage_text);
Flag(const char* name, bool* dst, const string& usage_text);
Flag(const char* name, string* dst, const string& usage_text);
+ Flag(const char* name, float* dst, const string& usage_text);
private:
friend class Flags;
@@ -72,11 +73,12 @@ class Flag {
bool Parse(string arg, bool* value_parsing_ok) const;
string name_;
- enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING } type_;
+ enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING, TYPE_FLOAT } type_;
int* int_value_;
int64* int64_value_;
bool* bool_value_;
string* string_value_;
+ float* float_value_;
string usage_text_;
};
diff --git a/tensorflow/core/util/command_line_flags_test.cc b/tensorflow/core/util/command_line_flags_test.cc
index b002e35899..62025463af 100644
--- a/tensorflow/core/util/command_line_flags_test.cc
+++ b/tensorflow/core/util/command_line_flags_test.cc
@@ -32,29 +32,35 @@ std::vector<char *> CharPointerVectorFromStrings(
}
return result;
}
-}
+} // namespace
TEST(CommandLineFlagsTest, BasicUsage) {
int some_int = 10;
int64 some_int64 = 21474836470; // max int32 is 2147483647
bool some_switch = false;
string some_name = "something";
- int argc = 5;
- std::vector<string> argv_strings = {
- "program_name", "--some_int=20", "--some_int64=214748364700",
- "--some_switch", "--some_name=somethingelse"};
+ float some_float = -23.23f;
+ int argc = 6;
+ std::vector<string> argv_strings = {"program_name",
+ "--some_int=20",
+ "--some_int64=214748364700",
+ "--some_switch",
+ "--some_name=somethingelse",
+ "--some_float=42.0"};
std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
bool parsed_ok =
Flags::Parse(&argc, argv_array.data(),
{Flag("some_int", &some_int, "some int"),
Flag("some_int64", &some_int64, "some int64"),
Flag("some_switch", &some_switch, "some switch"),
- Flag("some_name", &some_name, "some name")});
+ Flag("some_name", &some_name, "some name"),
+ Flag("some_float", &some_float, "some float")});
EXPECT_EQ(true, parsed_ok);
EXPECT_EQ(20, some_int);
EXPECT_EQ(214748364700, some_int64);
EXPECT_EQ(true, some_switch);
EXPECT_EQ("somethingelse", some_name);
+ EXPECT_NEAR(42.0f, some_float, 1e-5f);
EXPECT_EQ(argc, 1);
}
@@ -85,6 +91,21 @@ TEST(CommandLineFlagsTest, BadBoolValue) {
EXPECT_EQ(argc, 1);
}
+// A float flag given a non-numeric value must make Flags::Parse report
+// failure, leave the destination untouched, and still remove the argument
+// from argv.
+TEST(CommandLineFlagsTest, BadFloatValue) {
+  const float kOriginalValue = -23.23f;
+  float some_float = kOriginalValue;
+
+  std::vector<string> argv_strings = {"program_name",
+                                      "--some_float=notanumber"};
+  std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
+  int argc = 2;
+
+  const std::vector<Flag> flag_list = {
+      Flag("some_float", &some_float, "some float")};
+  const bool parsed_ok = Flags::Parse(&argc, argv_array.data(), flag_list);
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_NEAR(kOriginalValue, some_float, 1e-5f);
+  EXPECT_EQ(argc, 1);
+}
+
// Return whether str==pat, but allowing any whitespace in pat
// to match zero or more whitespace characters in str.
static bool MatchWithAnyWhitespace(const string &str, const string &pat) {
@@ -111,6 +132,8 @@ TEST(CommandLineFlagsTest, UsageString) {
int64 some_int64 = 21474836470; // max int32 is 2147483647
bool some_switch = false;
string some_name = "something";
+ // Don't test float in this case, because precision is hard to predict and
+ // match against, and we don't want a flakey test.
const string tool_name = "some_tool_name";
string usage = Flags::Usage(tool_name + "<flags>",
{Flag("some_int", &some_int, "some int"),
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
new file mode 100644
index 0000000000..1e72324fb0
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -0,0 +1,68 @@
+# Description:
+# TensorFlow C++ example that converts a WAV audio file into a spectrogram
+# image.
+
+package(
+ default_visibility = ["//tensorflow:internal"],
+ features = [
+ "-layering_check",
+ "-parse_headers",
+ ],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+ name = "wav_to_spectrogram_lib",
+ srcs = [
+ "wav_to_spectrogram.cc",
+ ],
+ hdrs = [
+ "wav_to_spectrogram.h",
+ ],
+ deps = [
+ "//tensorflow/cc:cc_ops",
+ "//tensorflow/core:framework_internal",
+ "//tensorflow/core:tensorflow",
+ ],
+)
+
+cc_binary(
+ name = "wav_to_spectrogram",
+ srcs = [
+ "main.cc",
+ ],
+ deps = [
+ ":wav_to_spectrogram_lib",
+ "//tensorflow/core:framework_internal",
+ "//tensorflow/core:tensorflow",
+ ],
+)
+
+cc_test(
+ name = "wav_to_spectrogram_test",
+ size = "medium",
+ srcs = ["wav_to_spectrogram_test.cc"],
+ deps = [
+ ":wav_to_spectrogram_lib",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ "bin/**",
+ "gen/**",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/examples/wav_to_spectrogram/README.md b/tensorflow/examples/wav_to_spectrogram/README.md
new file mode 100644
index 0000000000..7f7eb43700
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/README.md
@@ -0,0 +1,49 @@
+# TensorFlow Spectrogram Example
+
+This example shows how you can load audio from a .wav file, convert it to a
+spectrogram, and then save it out as a PNG image. A spectrogram is a
+visualization of the frequencies in sound over time, and can be useful as a
+feature for neural network recognition on noise or speech.
+
+## Building
+
+To build it, run this command:
+
+```bash
+bazel build tensorflow/examples/wav_to_spectrogram/...
+```
+
+That should build a binary executable that you can then run like this:
+
+```bash
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram
+```
+
+This uses a default test audio file that's part of the TensorFlow source code,
+and writes out the image to the current directory as spectrogram.png.
+
+## Options
+
+To load your own audio, you need to supply a .wav file in LIN16 format, and use
+the `--input_wav` flag to pass in the path.
+
+To control how the spectrogram is created, you can specify the `--window_size`
+and `--stride` arguments, which control how wide the window used to estimate
+frequencies is, and how widely adjacent windows are spaced.
+
+The `--output_image` flag sets the path to save the image file to. This is
+always written out in PNG format, even if you specify a different file
+extension.
+
+If your result seems too dark, try using the `--brightness` flag to make the
+output image easier to see.
+
+Here's an example of how to use all of them together:
+
+```bash
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram \
+--input_wav=/tmp/my_audio.wav \
+--window_size=1024 \
+--stride=512 \
+--brightness=64.0 \
+--output_image=/tmp/my_spectrogram.png
+```
diff --git a/tensorflow/examples/wav_to_spectrogram/main.cc b/tensorflow/examples/wav_to_spectrogram/main.cc
new file mode 100644
index 0000000000..539e6c4fe4
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/main.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+// Entry point: parses the command-line flags, then converts the input WAV
+// file into a spectrogram PNG. Returns 0 on success and -1 if flag parsing
+// fails, an unknown argument is left over, or the conversion fails.
+int main(int argc, char* argv[]) {
+ // These are the command-line flags the program can understand, together
+ // with their defaults. They choose the audio file to read, the window size
+ // and stride used to build the spectrogram, the brightness scaling applied
+ // to the image, and the path the PNG is written to.
+ tensorflow::string input_wav =
+ "tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav";
+ tensorflow::int32 window_size = 256;
+ tensorflow::int32 stride = 128;
+ float brightness = 64.0f;
+ tensorflow::string output_image = "spectrogram.png";
+ std::vector<tensorflow::Flag> flag_list = {
+ tensorflow::Flag("input_wav", &input_wav, "audio file to load"),
+ tensorflow::Flag("window_size", &window_size,
+ "frequency sample window width"),
+ tensorflow::Flag("stride", &stride,
+ "how far apart to place frequency windows"),
+ tensorflow::Flag("brightness", &brightness,
+ "controls how bright the output image is"),
+ tensorflow::Flag("output_image", &output_image,
+ "where to save the spectrogram image to"),
+ };
+ // Build the usage text before parsing so it can be logged on any error.
+ tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+ const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+ if (!parse_result) {
+ LOG(ERROR) << usage;
+ return -1;
+ }
+
+ // We need to call this to set up global state for TensorFlow.
+ tensorflow::port::InitMain(argv[0], &argc, &argv);
+ // Anything still left in argv at this point was not recognized.
+ if (argc > 1) {
+ LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+ return -1;
+ }
+
+ tensorflow::Status wav_status = WavToSpectrogram(
+ input_wav, window_size, stride, brightness, output_image);
+ if (!wav_status.ok()) {
+ LOG(ERROR) << "WavToSpectrogram failed with " << wav_status;
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
new file mode 100644
index 0000000000..c69a359637
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include <vector>
+
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_UINT8;
+using tensorflow::Output;
+using tensorflow::TensorShape;
+
+// Runs a TensorFlow graph to convert an audio file into a visualization.
+//
+// input_wav is the path of the WAV file to read, window_size and stride are
+// passed to the AudioSpectrogram op, brightness is the factor the spectrogram
+// values are multiplied by before conversion to 8-bit pixels, and output_image
+// is the path the PNG is written to. Returns the first error hit while
+// building the graph, creating the session or running it, and OK otherwise.
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+ tensorflow::int32 window_size,
+ tensorflow::int32 stride, float brightness,
+ const tensorflow::string& output_image) {
+ auto root = tensorflow::Scope::NewRootScope();
+ using namespace tensorflow::ops; // NOLINT(build/namespaces)
+ // The following block creates a TensorFlow graph that:
+ // - Reads and decodes the audio file into a tensor of float samples.
+ // - Creates a float spectrogram from those samples.
+ // - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
+ // - Reshapes the tensor so that it's [height, width, 1] for imaging.
+ // - Encodes it as a PNG stream and saves it out to a file.
+ Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+ DecodeWav wav_decoder =
+ DecodeWav(root.WithOpName("wav_decoder"), file_reader);
+ Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
+ wav_decoder.audio, window_size, stride);
+ // Brightness is fed in at run time through a scalar placeholder rather than
+ // being baked into the graph as a constant.
+ Output brightness_placeholder =
+ Placeholder(root.WithOpName("brightness_placeholder"), DT_FLOAT,
+ Placeholder::Attrs().Shape(TensorShape({})));
+ Output mul = Mul(root.WithOpName("mul"), spectrogram, brightness_placeholder);
+ // Only an upper clamp (255) is applied before the cast to uint8.
+ Output min_const = Const(root.WithOpName("min_const"), 255.0f);
+ Output min = Minimum(root.WithOpName("min"), mul, min_const);
+ Output cast = Cast(root.WithOpName("cast"), min, DT_UINT8);
+ // Append a trailing axis (index -1), then drop axis 0.
+ // NOTE(review): axis 0 looks like the audio-channel axis of the spectrogram,
+ // so this presumably only works for single-channel audio -- confirm.
+ Output expand_dims_const = Const(root.WithOpName("expand_dims_const"), -1);
+ Output expand_dims =
+ ExpandDims(root.WithOpName("expand_dims"), cast, expand_dims_const);
+ Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
+ Squeeze::Attrs().SqueezeDims({0}));
+ Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
+ // The write op is named "output_image"; that name is the target passed to
+ // Run() below.
+ WriteFile file_writer =
+ WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+ tensorflow::GraphDef graph;
+ TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
+
+ // Build a session object from this graph definition. The power of TensorFlow
+ // is that you can reuse complex computations like this, so usually we'd run a
+ // lot of different inputs through it. In this example, we're just doing a
+ // one-off run, so we'll create it and then use it immediately.
+ std::unique_ptr<tensorflow::Session> session(
+ tensorflow::NewSession(tensorflow::SessionOptions()));
+ TF_RETURN_IF_ERROR(session->Create(graph));
+
+ // We're passing in the brightness as an input, so create a tensor to hold the
+ // value.
+ tensorflow::Tensor brightness_tensor(DT_FLOAT, TensorShape({}));
+ brightness_tensor.scalar<float>()() = brightness;
+
+ // Run the session to analyze the audio and write out the file. Nothing is
+ // fetched; "output_image" is passed as a target node and no output tensors
+ // are requested.
+ TF_RETURN_IF_ERROR(
+ session->Run({{"brightness_placeholder", brightness_tensor}}, {},
+ {"output_image"}, nullptr));
+ return tensorflow::Status::OK();
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
new file mode 100644
index 0000000000..fa8cb0abe9
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#define THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+// Runs a TensorFlow graph to convert an audio file into a visualization. Takes
+// in the path to the audio file, the window size and stride parameters
+// controlling the spectrogram creation, the brightness scaling to use, and a
+// path to save the output PNG file to.
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+ tensorflow::int32 window_size,
+ tensorflow::int32 stride, float brightness,
+ const tensorflow::string& output_image);
+
+#endif // THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
new file mode 100644
index 0000000000..e599711445
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/wav/wav_io.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+// End-to-end smoke test: writes a tiny WAV file to the test temp directory,
+// converts it, and checks that an output file was produced. The contents of
+// the generated PNG are not inspected.
+TEST(WavToSpectrogramTest, WavToSpectrogramTest) {
+ const tensorflow::string input_wav =
+ tensorflow::io::JoinPath(tensorflow::testing::TmpDir(), "input_wav.wav");
+ const tensorflow::string output_image = tensorflow::io::JoinPath(
+ tensorflow::testing::TmpDir(), "output_image.png");
+ // Eight samples, encoded below with sample rate 44100 and one channel.
+ float audio[8] = {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+ tensorflow::string wav_string;
+ TF_ASSERT_OK(
+ tensorflow::wav::EncodeAudioAsS16LEWav(audio, 44100, 1, 8, &wav_string));
+ TF_ASSERT_OK(tensorflow::WriteStringToFile(tensorflow::Env::Default(),
+ input_wav, wav_string));
+ // window_size = 4, stride = 4, brightness = 64.
+ TF_ASSERT_OK(WavToSpectrogram(input_wav, 4, 4, 64.0f, output_image));
+ TF_EXPECT_OK(tensorflow::Env::Default()->FileExists(output_image));
+}
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 3e049724f6..a8e6ecdbf0 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -79,11 +79,13 @@ genrule(
srcs = [
"//third_party/hadoop:LICENSE.txt",
"//third_party/eigen3:LICENSE",
+ "//third_party/fft2d:LICENSE",
"@boringssl//:LICENSE",
"@com_googlesource_code_re2//:LICENSE",
"@curl//:COPYING",
"@eigen_archive//:COPYING.MPL2",
"@farmhash_archive//:COPYING",
+ "@fft2d//:fft/readme.txt",
"@gemmlowp//:LICENSE",
"@gif_archive//:COPYING",
"@highwayhash//:LICENSE",
@@ -106,11 +108,13 @@ genrule(
srcs = [
"//third_party/hadoop:LICENSE.txt",
"//third_party/eigen3:LICENSE",
+ "//third_party/fft2d:LICENSE",
"@boringssl//:LICENSE",
"@com_googlesource_code_re2//:LICENSE",
"@curl//:COPYING",
"@eigen_archive//:COPYING.MPL2",
"@farmhash_archive//:COPYING",
+ "@fft2d//:fft/readme.txt",
"@gemmlowp//:LICENSE",
"@gif_archive//:COPYING",
"@highwayhash//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 2a96e80ccb..c17a7f7fb1 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -91,12 +91,14 @@ filegroup(
name = "licenses",
data = [
"//third_party/eigen3:LICENSE",
+ "//third_party/fft2d:LICENSE",
"//third_party/hadoop:LICENSE.txt",
"@boringssl//:LICENSE",
"@com_googlesource_code_re2//:LICENSE",
"@curl//:COPYING",
"@eigen_archive//:COPYING.MPL2",
"@farmhash_archive//:COPYING",
+ "@fft2d//:fft/readme.txt",
"@gemmlowp//:LICENSE",
"@gif_archive//:COPYING",
"@grpc//:LICENSE",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 7bcdb1613d..dd42c69dd2 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -500,6 +500,16 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
name="zlib",
actual="@zlib_archive//:zlib",)
+ native.new_http_archive(
+ name = "fft2d",
+ urls = [
+ "http://bazel-mirror.storage.googleapis.com/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
+ "http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
+ ],
+ sha256 = "52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296",
+ build_file = str(Label("//third_party/fft2d:fft2d.BUILD")),
+ )
+
temp_workaround_http_archive(
name="snappy",
urls=[
diff --git a/third_party/fft2d/BUILD b/third_party/fft2d/BUILD
new file mode 100644
index 0000000000..93ea06e81b
--- /dev/null
+++ b/third_party/fft2d/BUILD
@@ -0,0 +1,30 @@
+# Headers for 2D Fast Fourier Transform package
+# from http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html
+# This is a separate package because the original downloaded archive doesn't
+# contain any header files.
+
+package(
+ default_visibility = ["//visibility:public"],
+)
+
+# Unrestricted use; can only distribute original package.
+# See fft/readme.txt
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+ name = "fft2d_headers",
+ srcs = ["fft.h"],
+)
+
+objc_library(
+ name = "fft2d_headersd_ios",
+ srcs = ["fft.h"],
+)
+
+# Export the source code so that it could be compiled for Android native apps.
+filegroup(
+ name = "fft2d_headers_srcs",
+ srcs = ["fft.h"],
+)
diff --git a/third_party/fft2d/LICENSE b/third_party/fft2d/LICENSE
new file mode 100644
index 0000000000..2bd85506a8
--- /dev/null
+++ b/third_party/fft2d/LICENSE
@@ -0,0 +1,3 @@
+Copyright(C) 1997,2001 Takuya OOURA (email: ooura@kurims.kyoto-u.ac.jp).
+You may use, copy, modify this code for any purpose and
+without fee. You may distribute this ORIGINAL package.
diff --git a/third_party/fft2d/fft.h b/third_party/fft2d/fft.h
new file mode 100644
index 0000000000..252cc01fec
--- /dev/null
+++ b/third_party/fft2d/fft.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Declarations for 1D FFT routines in third_party/fft2d/fft.
+
+#ifndef THIRD_PARTY_FFT2D_FFT_H__
+#define THIRD_PARTY_FFT2D_FFT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void cdft(int, int, double *, int *, double *);
+extern void rdft(int, int, double *, int *, double *);
+extern void ddct(int, int, double *, int *, double *);
+extern void ddst(int, int, double *, int *, double *);
+extern void dfct(int, double *, double *, int *, double *);
+extern void dfst(int, double *, double *, int *, double *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // THIRD_PARTY_FFT2D_FFT_H__
diff --git a/third_party/fft2d/fft2d.BUILD b/third_party/fft2d/fft2d.BUILD
new file mode 100644
index 0000000000..3dbd36aec0
--- /dev/null
+++ b/third_party/fft2d/fft2d.BUILD
@@ -0,0 +1,36 @@
+# 2D Fast Fourier Transform package
+# from http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html
+
+package(
+ default_visibility = ["//visibility:public"],
+)
+
+# Unrestricted use; can only distribute original package.
+licenses(["notice"])
+
+exports_files(["fft/readme.txt"])
+
+FFT2D_SRCS = [
+ "fft/fftsg.c",
+]
+
+# This is the main 2D FFT library. The 2D FFTs in this library call
+# 1D FFTs. In addition, fast DCTs are provided for the special case
+# of 8x8 and 16x16. This code in this library is referred to as
+# "Version II" on http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html.
+cc_library(
+ name = "fft2d",
+ srcs = FFT2D_SRCS,
+ linkopts = ["-lm"],
+)
+
+objc_library(
+ name = "fft2d_ios",
+ srcs = FFT2D_SRCS,
+)
+
+# Export the source code so that it could be compiled for Android native apps.
+filegroup(
+ name = "fft2d_srcs",
+ srcs = FFT2D_SRCS,
+)