aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
blob: b1b5126d9e9e5196a1733b80e0778e53cef7f774 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Encoding and decoding audio using FFmpeg."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
from tensorflow.contrib.util import loader
from tensorflow.python.framework import ops
from tensorflow.python.platform import resource_loader

# Resolve 'ffmpeg.so' through resource_loader and load it as an op library.
# This runs at import time, so importing this module has the side effect of
# loading the shared object.
# NOTE(review): presumably this is what makes the FFmpeg kernels used by the
# wrappers below available -- confirm against the build rules.
_ffmpeg_so = loader.load_op_library(
    resource_loader.get_path_to_datafile('ffmpeg.so'))


def decode_audio(contents, file_format=None, samples_per_second=None,
                 channel_count=None, stream=None):
  """Create an op that decodes the contents of an audio file.

  Note that ffmpeg is free to select the "best" audio track from an mp4.
  https://trac.ffmpeg.org/wiki/Map

  Args:
    contents: The binary contents of the audio file to decode. This is a
        scalar.
    file_format: A string or scalar string tensor specifying which
        format the contents will conform to. This can be mp3, mp4, ogg,
        or wav. Required: the `None` default only exists so the argument
        can be passed by keyword.
    samples_per_second: The number of samples per second that is
        assumed, as an `int` or scalar `int32` tensor. In some cases,
        resampling will occur to generate the correct sample rate.
        Required, like `file_format`.
    channel_count: The number of channels that should be created from the
        audio contents, as an `int` or scalar `int32` tensor. If the
        `contents` have more than this number, then some channels will
        be merged or dropped. If `contents` has fewer than this, then
        additional channels will be created from the existing ones.
        Required, like `file_format`.
    stream: A string specifying which stream from the content file
        should be decoded, e.g., '0' means the 0-th stream. Optional;
        when left as `None` it is forwarded unchanged and no stream is
        selected explicitly (presumably equivalent to '', which leaves
        the decision to ffmpeg).

  Returns:
    A rank-2 tensor that has time along dimension 0 and channels along
    dimension 1. Dimension 0 will be `samples_per_second *
    length_in_seconds` wide, and dimension 1 will be `channel_count`
    wide. If ffmpeg fails to decode the audio then an empty tensor will
    be returned.

  Raises:
    ValueError: If `file_format`, `samples_per_second` or `channel_count`
      is `None`.
  """
  # These three values are forwarded as op inputs and `None` is not a usable
  # value for them. Fail here with a message that names the offending
  # arguments instead of surfacing a conversion error from inside the op
  # wrapper.
  required = (
      ('file_format', file_format),
      ('samples_per_second', samples_per_second),
      ('channel_count', channel_count),
  )
  missing = [arg_name for arg_name, value in required if value is None]
  if missing:
    raise ValueError(
        'decode_audio requires non-None values for: %s' % ', '.join(missing))

  return gen_decode_audio_op_py.decode_audio_v2(
      contents, file_format=file_format, samples_per_second=samples_per_second,
      channel_count=channel_count, stream=stream)


# Declare the 'DecodeAudio' op type as not differentiable.
# NOTE(review): decode_audio() in this file calls decode_audio_v2; confirm
# whether the V2 op type needs its own NotDifferentiable registration.
ops.NotDifferentiable('DecodeAudio')


def encode_audio(audio, file_format=None, samples_per_second=None):
  """Creates an op that encodes an audio file using sampled audio from a tensor.

  Args:
    audio: A rank-2 `Tensor` that has time along dimension 0 and
        channels along dimension 1. Dimension 0 is `samples_per_second *
        length_in_seconds` long.
    file_format: The type of file to encode, as a string or rank-0
        string tensor. "wav" is the only supported format. Required: the
        `None` default only exists so the argument can be passed by
        keyword.
    samples_per_second: The number of samples in the audio tensor per
        second of audio, as an `int` or rank-0 `int32` tensor. Required,
        like `file_format`.

  Returns:
    A scalar tensor that contains the encoded audio in the specified file
    format.

  Raises:
    ValueError: If `file_format` or `samples_per_second` is `None`.
  """
  # Both values are forwarded as op inputs and `None` is not a usable value
  # for them. Fail here with a message that names the offending arguments
  # instead of surfacing a conversion error from inside the op wrapper.
  required = (
      ('file_format', file_format),
      ('samples_per_second', samples_per_second),
  )
  missing = [arg_name for arg_name, value in required if value is None]
  if missing:
    raise ValueError(
        'encode_audio requires non-None values for: %s' % ', '.join(missing))

  return gen_encode_audio_op_py.encode_audio_v2(
      audio,
      file_format=file_format,
      samples_per_second=samples_per_second,
      bits_per_second=192000)  # not used by WAV


# Declare the 'EncodeAudio' op type as not differentiable.
# NOTE(review): encode_audio() in this file calls encode_audio_v2; confirm
# whether the V2 op type needs its own NotDifferentiable registration.
ops.NotDifferentiable('EncodeAudio')


def decode_video(contents):
  """Build an op that decodes an encoded video file into RGB frames.

  Args:
    contents: Scalar holding the binary contents of the video file to
      decode.

  Returns:
    A rank-4 `Tensor` laid out as `[frames, height, width, 3]`, where the
    last dimension carries the RGB channels.
  """
  decoded_frames = gen_decode_video_op_py.decode_video(contents)
  return decoded_frames


# Declare the 'DecodeVideo' op type as not differentiable.
ops.NotDifferentiable('DecodeVideo')