# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmark for the matmul operator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools
import time

import numpy as np

from tensorflow.python.client import session as session_lib
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test


def build_graph(device, n, m, k, transpose_a, transpose_b, dtype):
  """Build a graph containing a single matmul operation.

  Args:
    device: String, the device to run on.
    n: tensor A's first dimension size.
    m: tensor A's second dimension size.
    k: tensor B's second dimension size.
    transpose_a: boolean value to show if tensor A is transposed.
    transpose_b: boolean value to show if tensor B is transposed.
    dtype: numpy data type of the input tensor.

  Returns:
    An op that runs the matmul, suitable for session.run().
  """
  with ops.device('%s' % device):
    # The logical computation is always an [n, m] x [m, k] matmul. When a
    # transpose flag is set, the corresponding variable is stored pre-transposed
    # so that the matmul kernel itself performs the transposition.
    if not transpose_a:
      x = variables.VariableV1(random_ops.random_uniform([n, m], dtype=dtype))
    else:
      x = variables.VariableV1(random_ops.random_uniform([m, n], dtype=dtype))
    if not transpose_b:
      y = variables.VariableV1(random_ops.random_uniform([m, k], dtype=dtype))
    else:
      y = variables.VariableV1(random_ops.random_uniform([k, m], dtype=dtype))

    z = math_ops.matmul(x, y, transpose_a=transpose_a, transpose_b=transpose_b)
    return control_flow_ops.group(z)


class MatmulBenchmark(test.Benchmark):
  """Benchmark matmul."""

  def run_graph(self, device, n, m, k, transpose_a, transpose_b, num_iters,
                dtype):
    """Run the graph and print its execution time.

    Args:
      device: String, the device to run on.
      n: tensor A's first dimension size.
      m: tensor A's second dimension size.
      k: tensor B's second dimension size.
      transpose_a: boolean value to show if tensor A is transposed.
      transpose_b: boolean value to show if tensor B is transposed.
      num_iters: number of iterations to run the benchmark.
      dtype: numpy data type of the input tensor.

    Returns:
      The duration of the run in seconds.
    """
    graph = ops.Graph()
    with graph.as_default():
      output = build_graph(device, n, m, k, transpose_a, transpose_b, dtype)
      with session_lib.Session(graph=graph) as session:
        variables.global_variables_initializer().run()
        # Warm-up runs to keep one-time costs (memory allocation, kernel
        # autotuning) out of the timed region.
        for _ in range(500):
          session.run(output)
        start_time = time.time()
        for _ in range(num_iters):
          session.run(output)
        duration = time.time() - start_time
        # An [n, m] x [m, k] matmul performs n * m * k multiply-adds, i.e.
        # 2 * n * m * k floating-point operations per iteration.
        num_items = n * m * k * 2
        throughput = num_items * num_iters / duration / 1e9
        print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' %
              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) +
               ',ta:' + str(transpose_a) + ',tb:' + str(transpose_b),
               num_iters, duration, throughput))

    name_template = 'matmul_{device}_{dtype}_input_info_{inputinfo}'
    self.report_benchmark(
        name=name_template.format(
            device=device,
            dtype=str(dtype).replace(' ', ''),
            inputinfo=str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:' +
            str(transpose_a) + ',tb:' + str(transpose_b)).replace(' ', ''),
        iters=num_iters,
        wall_time=duration)
    return duration

  def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters):
    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b,
                   num_iters, dtype)

  def test_round(self, num_iters):
    dtypes = [np.float32, np.float64]
    for dtype in dtypes:
      # Square [n, n] x [n, m] cases with n in {512, 1024}.
      for n, m, (transpose_a, transpose_b) in itertools.product(
          [512, 1024], [1, 8, 16, 128],
          [(False, False), (True, False), (False, True)]):
        k = n
        self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)

      # Tall-and-skinny cases with a large inner dimension.
      for n, m, k, (transpose_a, transpose_b) in itertools.product(
          [200], [1, 8, 20], [10000],
          [(False, False), (True, False), (False, True)]):
        self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)

      # A few fixed irregular shapes.
      for (n, m, k), (transpose_a, transpose_b) in itertools.product(
          [(200, 20, 20000), (1, 10000, 200)],
          [(False, False), (True, False), (False, True)]):
        self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)

  def benchmark_matmul(self):
    self.test_round(num_iters=200)


if __name__ == '__main__':
  test.main()
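
# Usage sketch (assumes TensorFlow's standard benchmark runner; the module
# path is an assumption and may differ in your checkout). Benchmarks defined
# on test.Benchmark subclasses are selected with the --benchmarks regex flag,
# so a typical invocation looks like:
#
#   python matmul_benchmark.py --benchmarks=benchmark_matmul
#
# Each selected benchmark prints one line per configuration and reports its
# wall time through self.report_benchmark().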