tensorflow/core/kernels/mkl_matmul_op.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

// This file uses MKL CBLAS xGEMM for acceleration of TF Matrix-Matrix
// Multiplication (MatMul) operations.
// We currently register this kernel only for MKL supported data
// types (float, double, complex64, complex128). The macro INTEL_MKL is defined
// by the build system only when MKL is chosen as an option at configure stage
// and when it is undefined at build time, this file becomes an empty
// compilation unit

#if defined(INTEL_MKL)

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "third_party/mkl/include/mkl_cblas.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;

template <typename Device, typename T, bool USE_CUBLAS>
class MklMatMulOp : public OpKernel {
 public:
  explicit MklMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& a = ctx->input(0);
    const Tensor& b = ctx->input(1);

    // Check that the dimensions of the two matrices are valid.
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
                errors::InvalidArgument("In[0] is not a matrix"));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
                errors::InvalidArgument("In[1] is not a matrix"));
    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
    dim_pair[0].first = transpose_a_ ? 0 : 1;
    dim_pair[0].second = transpose_b_ ? 1 : 0;

    OP_REQUIRES(ctx,
                a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
                errors::InvalidArgument("Matrix size-incompatible: In[0]: ",
                                        a.shape().DebugString(), ", In[1]: ",
                                        b.shape().DebugString()));
    int a_dim_remaining = 1 - dim_pair[0].first;
    int b_dim_remaining = 1 - dim_pair[0].second;
    TensorShape out_shape(
        {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));

    if (out->NumElements() == 0) {
      // If a has shape [0, x] or b has shape [x, 0], the output shape
      // is a 0-element matrix, so there is nothing to do.
      return;
    }

    if (a.NumElements() == 0 || b.NumElements() == 0) {
      // If a has shape [x, 0] and b has shape [0, y], the
      // output shape is [x, y] where x and y are non-zero, so we fill
      // the output with zeros.
      functor::SetZeroFunctor<Device, T> f;
      f(ctx->eigen_device<Device>(), out->flat<T>());
      return;
    }

    const int m = a.dim_size(1 - dim_pair[0].first);
    const int k = a.dim_size(dim_pair[0].first);
    const int n = b.dim_size(1 - dim_pair[0].second);
    bool transpose_a = dim_pair[0].first == 0;
    bool transpose_b = dim_pair[0].second == 1;

    auto a_ptr = (a.template flat<T>().data());
    auto b_ptr = (b.template flat<T>().data());
    auto c_ptr = (out->template flat<T>().data());

    MklBlasGemm(transpose_a, transpose_b, m, n, k, a_ptr, transpose_a ? m : k,
                b_ptr, transpose_b ? k : n, c_ptr, n);
  }

 private:
  bool transpose_a_;
  bool transpose_b_;

  // --------------------------------------------------------------------------
  //
  // @brief Matrix-Matrix Multiplication with FP32 tensors, a, b, c using CBLAS
  // interface. c = op(a) * op(b)
  //
  // @param transa  Specifies the form of op(a) used in MatMul. If transa is
  // true, then op(a) = a^T, otherwise op(a) = a
  //
  // @param transb  Specifies the form of op(b) used in MatMul. If transb is
  // true, then op(b) = b^T, otherwise op(b) = b
  //
  // @param m       Specifies the number of rows of the matrix op(a) and of the
  // matrix c. The value of m must be at least zero.
  //
  // @param n       Specifies the number of columns of the matrix op(b) and the
  // number of columns of the matrix c. The value of n must be at least zero.
  //
  // @param k       Specifies the number of columns of the matrix op(a) and the
  // number of rows of the matrix op(b)
  //
  // @param a       Address of matrix a
  //
  // @param lda     Leading dimension of 'a' matrix. This is set at calling site
  // depending on transa parameter. Since TF uses row-major
  // layout, leading dimension is the stride between consecutive rows
  // lda = max(1,k) when transa is false, otherwise lda = max(1,m)
  //
  // @param b       Address of matrix b
  //
  // @param ldb     Leading dimension of 'b' matrix. This is set at calling site
  // depending on transb parameter. Since TF uses row-major
  // layout, leading dimension is the stride between consecutive rows
  // ldb = max(1,n) when transb is false, otherwise ldb = max(1,k)
  //
  // @param c       Address of matrix c
  //
  // @param ldc     Leading dimension of 'c' matrix. Since TF uses row-major
  // layout, leading dimension is the stride between consecutive rows, max(1,n)
  //
  // --------------------------------------------------------------------------
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const float* a, const int lda, const float* b,
                   const int ldb, float* c, const int ldc) {
    // BLAS GEMM API defines Matrix Multiplication as c = alpha * op(a) * op(b)
    // + beta * c.
    // Since TF MatMul does not have parameters for alpha, beta, we set them to
    // 1.0 and 0.0 respectively.
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
                ldb, beta, c, ldc);
  }

  // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about
  // parameters, look at FP32 function description.
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const double* a, const int lda, const double* b,
                   const int ldb, double* c, const int ldc) {
    const double alpha = 1.0;
    const double beta = 0.0;
    cblas_dgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
                ldb, beta, c, ldc);
  }

  // Matrix-Matrix Multiplication with Complex64 (std::complex<float>) tensors.
  // For detailed info about parameters, look at FP32 function description.
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const std::complex<float>* a, const int lda,
                   const std::complex<float>* b, const int ldb,
                   std::complex<float>* c, int const ldc) {
    const MKL_Complex8 alpha = {1.0f, 0.0f};
    const MKL_Complex8 beta = {0.0f, 0.0f};
    cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k,
                static_cast<const void*>(&alpha), static_cast<const void*>(a),
                lda, static_cast<const void*>(b), ldb,
                static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
  }

  // Matrix-Matrix Multiplication with Complex128 (std::complex<double>)
  // tensors. For detailed info about parameters, look at FP32 function
  // description.
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const std::complex<double>* a, const int lda,
                   const std::complex<double>* b, const int ldb,
                   std::complex<double>* c, const int ldc) {
    const MKL_Complex16 alpha = {1.0, 0.0};
    const MKL_Complex16 beta = {0.0, 0.0};
    cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k,
                static_cast<const void*>(&alpha), static_cast<const void*>(a),
                lda, static_cast<const void*>(b), ldb,
                static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
  }
};

#define REGISTER_CPU(T)                                                      \
  REGISTER_KERNEL_BUILDER(                                                   \
      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),              \
      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);       \
  REGISTER_KERNEL_BUILDER(                                                   \
      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("MKL"), \
      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)

// TODO:Consider template specialization when adding/removing additional types
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
TF_CALL_complex128(REGISTER_CPU);

}  // namespace tensorflow
#endif  // INTEL_MKL