/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/lib/core/bits.h"

namespace tensorflow {

namespace {

using shape_inference::DimensionHandle;
using shape_inference::InferenceContext;
using shape_inference::ShapeHandle;

Status DecodeWavShapeFn(InferenceContext* c) {
  ShapeHandle unused;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));

  DimensionHandle channels_dim;
  int32 desired_channels;
  TF_RETURN_IF_ERROR(c->GetAttr("desired_channels", &desired_channels));
  if (desired_channels == 0) {
    channels_dim = c->UnknownDim();
  } else {
    if (desired_channels < 0) {
      return errors::InvalidArgument("channels must be non-negative, got ",
                                     desired_channels);
    }
    channels_dim = c->MakeDim(desired_channels);
  }
  DimensionHandle samples_dim;
  int32 desired_samples;
  TF_RETURN_IF_ERROR(c->GetAttr("desired_samples", &desired_samples));
  if (desired_samples == 0) {
    samples_dim = c->UnknownDim();
  } else {
    if (desired_samples < 0) {
      return errors::InvalidArgument("samples must be non-negative, got ",
                                     desired_samples);
    }
    samples_dim = c->MakeDim(desired_samples);
  }
  c->set_output(0, c->MakeShape({samples_dim, channels_dim}));
  c->set_output(1, c->Scalar());
  return Status::OK();
}

Status EncodeWavShapeFn(InferenceContext* c) {
  ShapeHandle unused;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
  c->set_output(0, c->Scalar());
  return Status::OK();
}

Status SpectrogramShapeFn(InferenceContext* c) {
  ShapeHandle input;
  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
  int32 window_size;
  TF_RETURN_IF_ERROR(c->GetAttr("window_size", &window_size));
  int32 stride;
  TF_RETURN_IF_ERROR(c->GetAttr("stride", &stride));

  DimensionHandle input_channels = c->Dim(input, 0);
  DimensionHandle input_length = c->Dim(input, 1);

  DimensionHandle output_length;
  if (!c->ValueKnown(input_length)) {
    output_length = c->UnknownDim();
  } else {
    const int64 input_length_value = c->Value(input_length);
    const int64 length_minus_window = (input_length_value - window_size);
    int64 output_length_value;
    if (length_minus_window < 0) {
      output_length_value = 0;
    } else {
      output_length_value = 1 + (length_minus_window / stride);
    }
    output_length = c->MakeDim(output_length_value);
  }

  DimensionHandle output_channels =
      c->MakeDim(1 + NextPowerOfTwo(window_size) / 2);
  c->set_output(0,
                c->MakeShape({input_channels, output_length, output_channels}));
  return Status::OK();
}

}  // namespace

REGISTER_OP("DecodeWav")
    .Input("contents: string")
    .Attr("desired_channels: int = -1")
    .Attr("desired_samples: int = -1")
    .Output("audio: float")
    .Output("sample_rate: int32")
    .SetShapeFn(DecodeWavShapeFn)
    .Doc(R"doc(
Decode a 16-bit PCM WAV file to a float tensor.

The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.

When desired_channels is set, if the input contains fewer channels than this
then the last channel will be duplicated to give the requested number, else if
the input has more channels than requested then the additional channels will be
ignored.

If desired_samples is set, then the audio will be cropped or padded with zeroes
to the requested length.

The first output contains a Tensor with the content of the audio samples. The
lowest dimension will be the number of channels, and the second will be the
number of samples. For example, a ten-sample-long stereo WAV file should give an
output shape of [10, 2].

contents: The WAV-encoded audio, usually from a file.
desired_channels: Number of sample channels wanted.
desired_samples: Length of audio requested.
audio: 2-D with shape `[length, channels]`.
sample_rate: Scalar holding the sample rate found in the WAV header.
)doc");

REGISTER_OP("EncodeWav")
    .Input("audio: float")
    .Input("sample_rate: int32")
    .Output("contents: string")
    .SetShapeFn(EncodeWavShapeFn)
    .Doc(R"doc(
Encode audio data using the WAV file format.

This operation will generate a string suitable to be saved out to create a .wav
audio file. It will be encoded in the 16-bit PCM format. It takes in float
values in the range -1.0f to 1.0f, and any outside that value will be clamped to
that range.

`audio` is a 2-D float Tensor of shape `[length, channels]`.
`sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).

audio: 2-D with shape `[length, channels]`.
sample_rate: Scalar containing the sample frequency.
contents: 0-D. WAV-encoded file contents.
)doc");

REGISTER_OP("AudioSpectrogram")
    .Input("input: float")
    .Attr("window_size: int")
    .Attr("stride: int")
    .Attr("magnitude_squared: bool = false")
    .Output("spectrogram: float")
    .SetShapeFn(SpectrogramShapeFn)
    .Doc(R"doc(
Produces a visualization of audio data over time.

Spectrograms are a standard way of representing audio information as a series of
slices of frequency information, one slice for each window of time. By joining
these together into a sequence, they form a distinctive fingerprint of the sound
over time.

This op expects to receive audio data as an input, stored as floats in the range
-1 to 1, together with a window width in samples, and a stride specifying how
far to move the window between slices. From this it generates a three
dimensional output. The lowest dimension has an amplitude value for each
frequency during that time slice. The next dimension is time, with successive
frequency slices. The final dimension is for the channels in the input, so a
stereo audio input would have two here for example.

This means the layout when converted and saved as an image is rotated 90 degrees
clockwise from a typical spectrogram. Time is descending down the Y axis, and
the frequency decreases from left to right.

Each value in the result represents the square root of the sum of the real and
imaginary parts of an FFT on the current window of samples. In this way, the
lowest dimension represents the power of each frequency in the current window,
and adjacent windows are concatenated in the next dimension.

To get a more intuitive and visual look at what this operation does, you can run
tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
resulting spectrogram as a PNG image.

input: Float representation of audio data.
window_size: How wide the input window is in samples. For the highest efficiency
  this should be a power of two, but other values are accepted.
stride: How widely apart the center of adjacent sample windows should be.
magnitude_squared: Whether to return the squared magnitude or just the
  magnitude. Using squared magnitude can avoid extra calculations.
spectrogram: 3D representation of the audio frequencies as an image.
)doc");

}  // namespace tensorflow