Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
-rw-r--r--   unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h      116
-rw-r--r--   unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  212
2 files changed, 308 insertions, 20 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
new file mode 100644
index 000000000..a97f043c1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+
+#if defined(EIGEN_USE_MKLDNN)
+// Support for the MklDnn sgemm kernel in Tensor contractions:
+//
+// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using a
+//    DataMapper (see TensorContractionInputMapper).
+// 2. Invoke the gemm kernel with the packed blocks (replacement for the
+//    default gebp_kernel).
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, typename StorageIndex, typename DataMapper,
+          int StorageOrder>
+struct mkldnn_gemm_pack;
+
+// mkldnn_gemm_pack for ColMajor storage order.
+template <typename Scalar, typename StorageIndex, typename DataMapper>
+struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
+                        /*StorageOrder*/ ColMajor> {
+  typedef typename internal::packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
+  enum { PacketSize = internal::packet_traits<Scalar>::size };
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar *block, const DataMapper &data_mapper,
+                  StorageIndex rows, StorageIndex cols) {
+    const StorageIndex unrolled_rows =
+        (rows / (4 * PacketSize)) * (4 * PacketSize);
+    const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
+
+    for (StorageIndex col = 0; col < cols; ++col) {
+      LinearMapper lm = data_mapper.getLinearMapper(0, col);
+
+      // Give the compiler a good opportunity to unroll the loop.
+      for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; ++j) {
+          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
+          internal::pstoreu(block + j * PacketSize, p);
+        }
+        block += 4 * PacketSize;
+      }
+
+      // Process the remaining rows with packet loads.
+      for (StorageIndex i = unrolled_rows; i < vectorized_rows;
+           i += PacketSize) {
+        const Packet p = lm.template loadPacket<Packet>(i);
+        internal::pstoreu(block, p);
+        block += PacketSize;
+      }
+
+      // Finalize with scalar coefficients.
+      for (StorageIndex i = vectorized_rows; i < rows; ++i) {
+        *block = lm(i);
+        ++block;
+      }
+    }
+  }
+};
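For reference, the vectorized packing above is equivalent to the following scalar-only sketch (illustration only, not part of the patch; pack_colmajor_sketch, src, and src_stride are hypothetical names, and the DataMapper is replaced by a plain strided column-major pointer):

// Scalar-only sketch of the packing loop above: copy a rows-by-cols
// column-major block from a strided source into a contiguous buffer,
// column by column.
template <typename Scalar, typename Index>
void pack_colmajor_sketch(Scalar* block, const Scalar* src, Index src_stride,
                          Index rows, Index cols) {
  for (Index col = 0; col < cols; ++col) {
    const Scalar* column = src + col * src_stride;
    for (Index i = 0; i < rows; ++i) *block++ = column[i];
  }
}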
+
+template <typename Scalar, typename StorageIndex, typename OutputMapper,
+          bool ConjugateLhs = false, bool ConjugateRhs = false>
+struct mkldnn_gemm_kernel;
+
+// mkldnn_gemm_kernel for floats, defined as a thin layer on top of
+// mkldnn_sgemm.
+template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
+          bool ConjugateRhs>
+struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
+                          ConjugateLhs, ConjugateRhs> {
+  EIGEN_DONT_INLINE
+  void operator()(const OutputMapper &output, const float *blockA,
+                  const float *blockB, const StorageIndex rows,
+                  const StorageIndex depth, const StorageIndex cols,
+                  float alpha) {
+    static const int max_index = (std::numeric_limits<int>::max)();
+
+    eigen_assert(max_index > rows);
+    eigen_assert(max_index > cols);
+    eigen_assert(max_index > depth);
+    eigen_assert(max_index > output.stride());
+
+    const int m = static_cast<int>(rows);
+    const int n = static_cast<int>(cols);
+    const int k = static_cast<int>(depth);
+
+    const char transposeA = ConjugateLhs ? 'Y' : 'N';
+    const char transposeB = ConjugateRhs ? 'Y' : 'N';
+
+    const int ldA = ConjugateLhs ? k : m;
+    const int ldB = ConjugateRhs ? n : k;
+    const int ldC = static_cast<int>(output.stride());
+
+    // beta == 1 accumulates the product into the output block.
+    const float beta = 1.0;
+
+    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
+                                      &alpha, blockA, &ldA, blockB, &ldB,
+                                      &beta, const_cast<float *>(output.data()),
+                                      &ldC);
+    eigen_assert(st == 0);
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+#endif  // EIGEN_USE_MKLDNN
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
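For context, here is a minimal standalone call following the same calling convention as the kernel above. This is a hedged sketch assuming the MKL-DNN v0.x C API shown in the patch, where all scalar parameters are passed by pointer and matrices are column-major; the values are illustrative:

// Computes C = alpha * A * B + beta * C, with beta = 1 accumulating into C.
#include <mkldnn.h>

void sgemm_example() {
  const int m = 2, n = 2, k = 3;
  const float A[m * k] = {1, 4, 2, 5, 3, 6};     // 2x3, column-major
  const float B[k * n] = {7, 9, 11, 8, 10, 12};  // 3x2, column-major
  float C[m * n] = {0};                          // 2x2 accumulator
  const char no_trans = 'N';
  const float alpha = 1.0f, beta = 1.0f;
  mkldnn_status_t st = mkldnn_sgemm(&no_trans, &no_trans, &m, &n, &k, &alpha,
                                    A, &m, B, &k, &beta, C, &m);
  // On success st == mkldnn_success and C holds {58, 139, 64, 154}
  // (i.e. [[58, 64], [139, 154]] in column-major layout).
  (void)st;
}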
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 0980854b4..0c4d2f0bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,6 +15,177 @@
 
 namespace Eigen {
 
+namespace internal {
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of the Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. The default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low-level libraries to
+// perform matrix multiplication, and still rely on the Eigen thread pool
+// evaluator for scaling. The assumption is that a custom gemm kernel does not
+// use its own threading for parallelization.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar types for the result of the
+//   multiplication, the lhs tensor and the rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+//   almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it is always a column-major blas_data_mapper (it must be of
+//   ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a
+//   two-dimensional view into the Lhs/Rhs tensor expressions. In practice it
+//   is TensorContractionInputMapper, or some specialization of it based on
+//   the type of the tensor expression (e.g. TensorImagePatchOp has an
+//   optimized input mapper).
+//
+// TODO(ezhulenev): Use TensorContractionKernel in the default tensor
+// contraction evaluator.
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel {
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+                                  typename LhsMapper::SubMapper,
+                                  Traits::mr, Traits::LhsProgress,
+                                  typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(LhsScalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, depth, rows);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(RhsScalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper,
+                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+                     const StorageIndex rows, const StorageIndex depth,
+                     const StorageIndex cols, const ResScalar alpha) {
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ -1, /*strideB*/ -1,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+};
+
+// Some tensor contraction kernels might rely on gemm libraries that are
+// optimized for specific dimension sizes. By default Eigen picks block sizes
+// to fit the working set in the L1/L2 caches; by specializing this class we
+// can refine that choice and round the sizes up to work well with the
+// underlying gemm library.
+// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex>
+struct TensorContractionKernelBlocking {
+  static void refine(const StorageIndex /*m*/,
+                     const StorageIndex /*n*/,
+                     const StorageIndex /*k*/,
+                     StorageIndex* /*bm*/,
+                     StorageIndex* /*bn*/,
+                     StorageIndex* /*bk*/) {
+    // By default we do nothing and stick to the block sizes picked by Eigen.
+  }
+};
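To make the contract concrete, here is a sketch of how an evaluator might drive any TensorContractionKernel for a single block (illustration only; contract_one_block is a hypothetical helper, and sub-mapper offsets and buffer allocation are elided):

// The three-step protocol every TensorContractionKernel specialization
// must support: pack lhs, pack rhs, multiply-accumulate the packed blocks.
template <typename Kernel, typename LhsMapper, typename RhsMapper,
          typename OutputMapper, typename Scalar, typename Index>
void contract_one_block(const LhsMapper& lhs, const RhsMapper& rhs,
                        const OutputMapper& output, Scalar* lhs_block,
                        Scalar* rhs_block, Index bm, Index bn, Index bk) {
  // 1. Pack a bm x bk block of the lhs expression into contiguous memory.
  Kernel::packLhs(lhs_block, lhs.getSubMapper(0, 0), bk, bm);
  // 2. Pack a bk x bn block of the rhs expression.
  Kernel::packRhs(rhs_block, rhs.getSubMapper(0, 0), bk, bn);
  // 3. Multiply the packed blocks, accumulating into the output.
  Kernel::invoke(output, lhs_block, rhs_block, bm, bk, bn, Scalar(1));
}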
+
+#if defined(EIGEN_USE_MKLDNN)
+// If all scalar types in the tensor contraction are floats, we can use the
+// mkldnn gemm as our low-level kernel.
+template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel<float, float, float, StorageIndex,
+                               OutputMapper, LhsMapper, RhsMapper> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  typedef float Scalar;
+
+  typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename LhsMapper::SubMapper, ColMajor>
+      LhsPacker;
+
+  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename RhsMapper::SubMapper, ColMajor>
+      RhsPacker;
+
+  typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
+      GemmKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(Scalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      StorageIndex depth, StorageIndex rows) {
+    // mkldnn_gemm_pack copies a (rows x cols) block, so the Lhs block of
+    // shape (rows x depth) passes `rows` first and `depth` as the columns.
+    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(Scalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
+                     const Scalar* rhsBlock, const StorageIndex rows,
+                     const StorageIndex depth, const StorageIndex cols,
+                     const Scalar alpha) {
+    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+  }
+};
+
+// For mkldnn_sgemm, having the right dimensions (especially for small
+// matrices) is more important than fitting the whole working set in the
+// L1/L2 caches.
+template <typename StorageIndex>
+struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
+  static const StorageIndex kUnrollM = 48;
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick 24, the least
+  // common multiple of both.
+  static const StorageIndex kUnrollN = 24;
+
+  static void refine(const StorageIndex m,
+                     const StorageIndex n,
+                     const StorageIndex /*k*/,
+                     StorageIndex* bm,
+                     StorageIndex* bn,
+                     StorageIndex* /*bk*/) {
+    // TODO(ezhulenev): There is probably a better way to pick block sizes.
+    *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
+    *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
+    // Stick with the default bk.
+  }
+};
+
+#endif  // EIGEN_USE_MKLDNN
+
+}  // namespace internal
+
 template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       bn = blocking.nc();
       bk = blocking.kc();
     }
+    // Refine the blocking choice to work well with the contraction kernel.
+    internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
+                                              Index>::refine(m, n, k, &bm,
+                                                             &bn, &bk);
 
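A worked example of the rounding performed by refine() (the values are hypothetical): with kUnrollM = 48 and an Eigen-chosen bm = 100 for a problem with m = 512, divup(100, 48) = 3, so bm is rounded up to 3 * 48 = 144; taking the min with m keeps the block within the problem size.

// Standalone sketch of the refine() arithmetic above (illustration only).
#include <algorithm>

int refine_block_dim(int dim, int block, int unroll) {
  const int rounded = ((block + unroll - 1) / unroll) * unroll;  // divup * unroll
  return std::min(dim, rounded);
}
// refine_block_dim(512, 100, 48) == 144; refine_block_dim(32, 100, 48) == 32.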
     // Number of kernels for each dimension.
     Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         contract_t, internal::packet_traits<RhsScalar>::size,
         rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
         RhsMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
+
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
             Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     void pack_lhs(Index m, Index k) {
       const Index mend = m * gm_ + gm(m);
       for (Index m1 = m * gm_; m1 < mend; m1++)
-        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
-                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+                                         lhs_.getSubMapper(m1 * bm_, k * bk_),
+                                         bk(k), bm(m1));
 
       if (!parallel_pack_ && shard_by_col_) {
         signal_packing(k);
@@ -458,8 +629,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           // deadlocks.
           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
         }
-        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
-                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+                                         rhs_.getSubMapper(k * bk_, n1 * bn_),
+                                         bk(k), bn(n1));
       }
 
       if (parallel_pack_ || shard_by_col_) {
@@ -480,9 +652,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         for (Index n1 = n * gn_; n1 < nend; n1++) {
           for (Index m1 = m * gm_; m1 < mend; m1++) {
             const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            TensorContractionKernel::invoke(
+                output_mapper, packed_lhs_[k % (P - 1)][m1],
+                packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
 
             // We are done with the last task for the [m1, n1] block.
             if (k + 1 == nk_) {
@@ -495,9 +667,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         for (Index m1 = m * gm_; m1 < mend; m1++)
           for (Index n1 = n * gn_; n1 < nend; n1++) {
             const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            TensorContractionKernel::invoke(
+                output_mapper, packed_lhs_[k % (P - 1)][m1],
+                packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
 
             // We are done with the last task for the [m1, n1] block.
             if (k + 1 == nk_) {
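For completeness, here is a hypothetical end-to-end usage that exercises the thread-pool contraction path touched by this patch (standard Tensor module API; names and sizes are illustrative):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

void contract_on_thread_pool() {
  Eigen::Tensor<float, 2> a(128, 64);
  Eigen::Tensor<float, 2> b(64, 256);
  a.setRandom();
  b.setRandom();

  // Contract the second dimension of `a` with the first dimension of `b`.
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};

  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> c(128, 256);
  c.device(device) = a.contract(b, dims);  // dispatches to the evaluator above
}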