[tf-signal] Add tf.contrib.signal.mfccs_from_log_mel_spectrograms.

PiperOrigin-RevId: 170753517
author: RJ Ryan <rjryan@google.com> 2017-10-02 14:26:45 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-10-02 14:36:47 -0700
commit: f94d410c701a9b9e41b3094af0f66bf9490a9838 (patch)
tree: 7b41ed4f0bb858b12a15fdfbb90f7007a3876fa4 /tensorflow/contrib/signal
parent: 88cdf1f81fa1938c5bb81c5d293fc0ed0758cadc (diff)
4 files changed, 271 insertions, 0 deletions
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index 8c11cf0d64..6025ec5b57 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -35,6 +35,20 @@ cuda_py_tests(
 )
 
 cuda_py_tests(
+    name = "mfcc_ops_test",
+    srcs = ["python/kernel_tests/mfcc_ops_test.py"],
+    additional_deps = [
+        ":signal_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:spectral_ops_test_util",
+    ],
+)
+
+cuda_py_tests(
     name = "reconstruction_ops_test",
     srcs = ["python/kernel_tests/reconstruction_ops_test.py"],
     additional_deps = [
diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py
index 25123b097e..0f2592b0b0 100644
--- a/tensorflow/contrib/signal/__init__.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -20,6 +20,7 @@ See the @{$python/contrib.signal} guide.
 @@hamming_window
 @@hann_window
 @@inverse_stft
+@@mfccs_from_log_mel_spectrograms
 @@linear_to_mel_weight_matrix
 @@overlap_and_add
 @@stft
@@ -27,6 +28,7 @@ See the @{$python/contrib.signal} guide.
 [hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
 [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
 [mel]: https://en.wikipedia.org/wiki/Mel_scale
+[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
 [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
 """
 
@@ -35,6 +37,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.signal.python.ops.mel_ops import linear_to_mel_weight_matrix
+from tensorflow.contrib.signal.python.ops.mfcc_ops import mfccs_from_log_mel_spectrograms
 from tensorflow.contrib.signal.python.ops.reconstruction_ops import overlap_and_add
 from tensorflow.contrib.signal.python.ops.shape_ops import frame
 # `frame` used to be named `frames`, which is a noun and not a verb.
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
new file mode 100644
index 0000000000..b3a8d40c13
--- /dev/null
+++ b/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
@@ -0,0 +1,117 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for mfcc_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+
+import numpy as np
+
+
+from tensorflow.contrib.signal.python.ops import mfcc_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+# TODO(rjryan): Add scipy.fftpack to the TensorFlow build.
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+fftpack = try_import("scipy.fftpack")
+
+
+class DCTTest(test.TestCase):
+
+  def _np_dct2(self, signals, norm=None):
+    """Computes the DCT-II manually with NumPy."""
+    # X_k = sum_{n=0}^{N-1} x_n * cos(\frac{pi}{N} * (n + 0.5) * k)  k=0,...,N-1
+    dct_size = signals.shape[-1]
+    dct = np.zeros_like(signals)
+    for k in range(dct_size):
+      phi = np.cos(np.pi * (np.arange(dct_size) + 0.5) * k / dct_size)
+      dct[..., k] = np.sum(signals * phi, axis=-1)
+    # SciPy's `dct` has a scaling factor of 2.0 which we follow.
+    # https://github.com/scipy/scipy/blob/v0.15.1/scipy/fftpack/src/dct.c.src
+    if norm == "ortho":
+      # The orthogonal scaling includes a factor of 0.5 which we combine with
+      # the overall scaling of 2.0 to cancel.
+      dct[..., 0] *= np.sqrt(1.0 / dct_size)
+      dct[..., 1:] *= np.sqrt(2.0 / dct_size)
+    else:
+      dct *= 2.0
+    return dct
+
+  def test_compare_to_numpy(self):
+    """Compare dct against a manual DCT-II implementation."""
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session(use_gpu=True):
+        for size in range(1, 23):
+          signals = np.random.rand(size).astype(np.float32)
+          actual_dct = mfcc_ops._dct2_1d(signals).eval()
+          expected_dct = self._np_dct2(signals)
+          self.assertAllClose(expected_dct, actual_dct, atol=5e-4, rtol=5e-4)
+
+  def test_compare_to_fftpack(self):
+    """Compare dct against scipy.fftpack.dct."""
+    if not fftpack:
+      return
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session(use_gpu=True):
+        for size in range(1, 23):
+          signal = np.random.rand(size).astype(np.float32)
+          actual_dct = mfcc_ops._dct2_1d(signal).eval()
+          expected_dct = fftpack.dct(signal, type=2)
+          self.assertAllClose(expected_dct, actual_dct, atol=5e-4, rtol=5e-4)
+
+
+# TODO(rjryan): We have no open source tests for MFCCs at the moment. Internally
+# at Google, this code is tested against a reference implementation that follows
+# HTK conventions.
+class MFCCTest(test.TestCase):
+
+  def test_error(self):
+    # num_mel_bins must be positive.
+    with self.assertRaises(ValueError):
+      signal = array_ops.zeros((2, 3, 0))
+      mfcc_ops.mfccs_from_log_mel_spectrograms(signal)
+
+    # signal must be float32
+    with self.assertRaises(ValueError):
+      signal = array_ops.zeros((2, 3, 5), dtype=dtypes.float64)
+      mfcc_ops.mfccs_from_log_mel_spectrograms(signal)
+
+  def test_basic(self):
+    """A basic test that the op runs on random input."""
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session(use_gpu=True):
+        signal = random_ops.random_normal((2, 3, 5))
+        mfcc_ops.mfccs_from_log_mel_spectrograms(signal).eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/signal/python/ops/mfcc_ops.py b/tensorflow/contrib/signal/python/ops/mfcc_ops.py
new file mode 100644
index 0000000000..35b6d3ad45
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/mfcc_ops.py
@@ -0,0 +1,137 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Mel-Frequency Cepstral Coefficients (MFCCs) ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import spectral_ops
+
+
+# TODO(rjryan): Remove once tf.spectral.dct exists.
+def _dct2_1d(signals, name=None):
+  """Computes the type II 1D Discrete Cosine Transform (DCT) of `signals`.
+
+  Args:
+    signals: A `[..., samples]` `float32` `Tensor` containing the signals to
+      take the DCT of.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `float32` `Tensor` containing the DCT of `signals`.
+
+  """
+  with ops.name_scope(name, 'dct', [signals]):
+    # We use the FFT to compute the DCT and TensorFlow only supports float32 for
+    # FFTs at the moment.
+    signals = ops.convert_to_tensor(signals, dtype=dtypes.float32)
+
+    axis_dim = signals.shape[-1].value or array_ops.shape(signals)[-1]
+    axis_dim_float = math_ops.to_float(axis_dim)
+    scale = 2.0 * math_ops.exp(math_ops.complex(
+        0.0, -math.pi * math_ops.range(axis_dim_float) /
+        (2.0 * axis_dim_float)))
+
+    rfft = spectral_ops.rfft(signals, fft_length=[2 * axis_dim])[..., :axis_dim]
+    dct2 = math_ops.real(rfft * scale)
+    return dct2
+
+
+def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
+  """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.
+
+  Implemented with GPU-compatible ops and supports gradients.
+
+  [Mel-Frequency Cepstral Coefficient (MFCC)][mfcc] calculation consists of
+  taking the DCT-II of a log-magnitude mel-scale spectrogram. [HTK][htk]'s MFCCs
+  use a particular scaling of the DCT-II which is almost orthogonal
+  normalization. We follow this convention.
+
+  All `num_mel_bins` MFCCs are returned and it is up to the caller to select
+  a subset of the MFCCs based on their application. For example, it is typical
+  to only use the first few for speech recognition, as this results in
+  an approximately pitch-invariant representation of the signal.
+
+  For example:
+
+  ```python
+  sample_rate = 16000.0
+  # A Tensor of [batch_size, num_samples] mono PCM samples in the range [-1, 1].
+  pcm = tf.placeholder(tf.float32, [None, None])
+
+  # A 1024-point STFT with frames of 64 ms and 75% overlap.
+  stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
+                                 fft_length=1024)
+  spectrograms = tf.abs(stft)
+
+  # Warp the linear scale spectrograms into the mel-scale.
+  num_spectrogram_bins = stfts.shape[-1].value
+  lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
+  linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
+    num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
+    upper_edge_hertz)
+  mel_spectrograms = tf.tensordot(
+    spectrograms, linear_to_mel_weight_matrix, 1)
+  mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
+    linear_to_mel_weight_matrix.shape[-1:]))
+
+  # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
+  log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
+
+  # Compute MFCCs from log_mel_spectrograms and take the first 13.
+  mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
+    log_mel_spectrograms)[..., :13]
+  ```
+
+  Args:
+    log_mel_spectrograms: A `[..., num_mel_bins]` `float32` `Tensor` of
+      log-magnitude mel-scale spectrograms.
+    name: An optional name for the operation.
+  Returns:
+    A `[..., num_mel_bins]` `float32` `Tensor` of the MFCCs of
+    `log_mel_spectrograms`.
+
+  Raises:
+    ValueError: If `num_mel_bins` is not positive.
+
+  [mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+  [htk]: https://en.wikipedia.org/wiki/HTK_(software)
+  """
+  with ops.name_scope(name, 'mfccs_from_log_mel_spectrograms',
+                      [log_mel_spectrograms]):
+    # Compute the DCT-II of the resulting log-magnitude mel-scale spectrogram.
+    # The DCT used in HTK scales every basis vector by sqrt(2/N), which is the
+    # scaling required for an "orthogonal" DCT-II *except* in the 0th bin, where
+    # the true orthogonal DCT (as implemented by scipy) scales by sqrt(1/N). For
+    # this reason, we don't apply orthogonal normalization and scale the DCT by
+    # `0.5 * sqrt(2/N)` manually.
+    log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms,
+                                                 dtype=dtypes.float32)
+    if (log_mel_spectrograms.shape.ndims and
+        log_mel_spectrograms.shape[-1].value is not None):
+      num_mel_bins = log_mel_spectrograms.shape[-1].value
+      if num_mel_bins == 0:
+        raise ValueError('num_mel_bins must be positive. Got: %s' %
+                         log_mel_spectrograms)
+    else:
+      num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
+    return _dct2_1d(log_mel_spectrograms) * math_ops.rsqrt(num_mel_bins * 2.0)
author	RJ Ryan <rjryan@google.com>	2017-10-02 14:26:45 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-10-02 14:36:47 -0700
commit	f94d410c701a9b9e41b3094af0f66bf9490a9838 (patch)
tree	7b41ed4f0bb858b12a15fdfbb90f7007a3876fa4 /tensorflow/contrib/signal
parent	88cdf1f81fa1938c5bb81c5d293fc0ed0758cadc (diff)