-rw-r--r--  cmake/FindMkldnn.cmake                                             18
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                                      5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h           102
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h    44
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h     116
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h 183
-rw-r--r--  unsupported/test/CMakeLists.txt                                    15
-rw-r--r--  unsupported/test/cxx11_tensor_contraction_mkldnn.cpp             134
8 files changed, 113 insertions, 504 deletions
diff --git a/cmake/FindMkldnn.cmake b/cmake/FindMkldnn.cmake
deleted file mode 100644
index 6acba9d8c..000000000
--- a/cmake/FindMkldnn.cmake
+++ /dev/null
@@ -1,18 +0,0 @@
-# Intel mkl-dnn support.
-# Link: https://github.com/intel/mkl-dnn
-if (MKLDNN)
- set(MKLDNN_FIND_QUIETLY TRUE)
- set(MKLDNN_INCLUDES ${MKLDNN}/include)
- set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
-endif (MKLDNN)
-find_path(MKLDNN
- NAMES
- mkldnn.h
- PATHS
- $ENV{MKLDNNDIR}/include
- ${INCLUDE_INSTALL_DIR}
- )
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(MKLDNN DEFAULT_MSG
- MKLDNN)
-mark_as_advanced(MKLDNN)
\ No newline at end of file
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 420afe650..14a2d0f0d 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -75,10 +75,6 @@ typedef unsigned __int64 uint64_t;
#include "libxsmm.h"
#endif
-#if defined(EIGEN_USE_MKLDNN)
-#include "mkldnn.h"
-#endif
-
#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
@@ -125,7 +121,6 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
-#include "src/Tensor/TensorContractionMkldnn.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 3b22e43e7..d220f82be 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -136,6 +136,81 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_,
static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
};
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen contraction
+// evaluator. This also includes full support in TensorContractionThreadPool,
+// assuming that the underlying gemm does not use its own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+// multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+// almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+// practice it is always a column major blas_data_mapper (it must be of
+// ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
+// dimensional view into the Lhs/Rhs tensor expressions. In practice it is
+// TensorContractionInputMapper, or some specialization of it based on the
+// type of tensor expression (e.g. TensorImagePatchOp has optimized input
+// mapper).
+template<typename ResScalar, typename LhsScalar, typename RhsScalar,
+ typename StorageIndex, typename OutputMapper, typename LhsMapper,
+ typename RhsMapper>
+struct TensorContractionKernel {
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+ typename LhsMapper::SubMapper,
+ Traits::mr, Traits::LhsProgress,
+ typename Traits::LhsPacket4Packing, ColMajor>
+ LhsPacker;
+
+ typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+ typename RhsMapper::SubMapper, Traits::nr,
+ ColMajor>
+ RhsPacker;
+
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+ OutputMapper, Traits::mr, Traits::nr,
+ /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+ GebpKernel;
+
+ EIGEN_DONT_INLINE
+ static void packLhs(LhsScalar* lhsBlock,
+ const typename LhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex rows) {
+ LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
+ }
+
+ EIGEN_DONT_INLINE
+ static void packRhs(RhsScalar* rhsBlock,
+ const typename RhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex cols) {
+ RhsPacker()(rhsBlock, data_mapper, depth, cols);
+ }
+
+ EIGEN_DONT_INLINE
+ static void invoke(const OutputMapper& output_mapper,
+ const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+ const StorageIndex rows, const StorageIndex depth,
+ const StorageIndex cols, const ResScalar alpha) {
+ GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+ /*strideA*/ -1, /*strideB*/ -1,
+ /*offsetA*/ 0, /*offsetB*/ 0);
+ }
+};
+
} // end namespace internal
// Tensor contraction params that should enable to get from output matrix
@@ -591,13 +666,9 @@ struct TensorContractionEvaluatorBase
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- // define mr, nr, and all of my data mapper types
+ // define data mappers for Lhs and Rhs
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
- const Index nr = Traits::nr;
- const Index mr = Traits::mr;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@@ -619,11 +690,9 @@ struct TensorContractionEvaluatorBase
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- // Declare GEBP packing and kernel structs
- internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
- internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
- internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;
// initialize data mappers
LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@@ -635,7 +704,7 @@ struct TensorContractionEvaluatorBase
OutputMapper output(buffer, m);
// Sizes of the blocks to load in cache. See the Goto paper for details.
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n, 1);
const Index kc = blocking.kc();
const Index mc = numext::mini(m, blocking.mc());
const Index nc = numext::mini(n, blocking.nc());
@@ -651,19 +720,22 @@ struct TensorContractionEvaluatorBase
for (Index k2 = 0; k2 < k; k2 += kc) {
// make sure we don't overshoot right edge of left matrix, then pack vertical panel
const Index actual_kc = numext::mini(k2 + kc, k) - k2;
- pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+ TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
+ actual_kc, actual_mc);
// series of horizontal blocks
for (Index j2 = 0; j2 < n; j2 += nc) {
// make sure we don't overshoot right edge of right matrix, then pack block
const Index actual_nc = numext::mini(j2 + nc, n) - j2;
- pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+ TensorContractionKernel::packRhs(blockB, rhs.getSubMapper(k2, j2),
+ actual_kc, actual_nc);
// call gebp (matrix kernel)
// The parameters here are copied from Eigen's GEMM implementation
const OutputMapper output_mapper = output.getSubMapper(i2, j2);
- gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc,
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(output_mapper, blockA, blockB,
+ actual_mc, actual_kc, actual_nc,
+ Scalar(1));
// We are done with this [i2, j2] output block.
if (k2 + kc >= k) {
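For illustration, a third party gemm library can be plugged in by specializing
the TensorContractionKernel struct added above, in the same way the mkldnn
specialization removed below did. A minimal sketch, assuming a hypothetical
column major C function my_sgemm(m, n, k, alpha, A, lda, B, ldb, C, ldc) that
computes C += alpha * A * B; the packing here is a plain scalar column major
copy, where the removed mkldnn_gemm_pack (below) uses packet loads:

    namespace Eigen {
    namespace internal {

    // Sketch only: route the all-float contraction kernel to my_sgemm().
    template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
              typename RhsMapper>
    struct TensorContractionKernel<float, float, float, StorageIndex,
                                   OutputMapper, LhsMapper, RhsMapper> {
      // Copy a rows x cols block through the mapper's LinearMapper, one
      // column at a time, into a contiguous column major buffer.
      template <typename SubMapper>
      static void pack(float* block, const SubMapper& m, StorageIndex rows,
                       StorageIndex cols) {
        for (StorageIndex c = 0; c < cols; ++c) {
          typename SubMapper::LinearMapper lm = m.getLinearMapper(0, c);
          for (StorageIndex r = 0; r < rows; ++r) *block++ = lm(r);
        }
      }

      static void packLhs(float* lhsBlock,
                          const typename LhsMapper::SubMapper& data_mapper,
                          const StorageIndex depth, const StorageIndex rows) {
        pack(lhsBlock, data_mapper, rows, depth);  // Lhs block is rows x depth
      }

      static void packRhs(float* rhsBlock,
                          const typename RhsMapper::SubMapper& data_mapper,
                          const StorageIndex depth, const StorageIndex cols) {
        pack(rhsBlock, data_mapper, depth, cols);  // Rhs block is depth x cols
      }

      static void invoke(const OutputMapper& output_mapper,
                         const float* lhsBlock, const float* rhsBlock,
                         const StorageIndex rows, const StorageIndex depth,
                         const StorageIndex cols, const float alpha) {
        // Packed blocks are contiguous, so ldA == rows and ldB == depth;
        // the output leading dimension comes from the mapper.
        my_sgemm(static_cast<int>(rows), static_cast<int>(cols),
                 static_cast<int>(depth), alpha,
                 lhsBlock, static_cast<int>(rows),
                 rhsBlock, static_cast<int>(depth),
                 output_mapper.data(),
                 static_cast<int>(output_mapper.stride()));
      }
    };

    }  // namespace internal
    }  // namespace Eigen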
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index cf281192c..71fd19774 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,7 +21,7 @@ enum {
// Default Blocking Strategy
-template <typename LhsScalar, typename RhsScalar, typename Index, int ShardingType=ShardByCol>
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
class TensorContractionBlocking {
public:
@@ -42,7 +42,7 @@ class TensorContractionBlocking {
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
- TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+ TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
@@ -53,23 +53,23 @@ class TensorContractionBlocking {
}
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
private:
- Index kc_;
- Index mc_;
- Index nc_;
+ StorageIndex kc_;
+ StorageIndex mc_;
+ StorageIndex nc_;
};
#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
+template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
class TensorXsmmContractionBlocking {
public:
- TensorXsmmContractionBlocking(Index k, Index m, Index n,
+ TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
size_t max_num_threads = 1, bool transposeA = false,
bool transposeB = false):
k_(k), m_(m), n_(n), transposeA_(transposeA),
@@ -164,28 +164,28 @@ class TensorXsmmContractionBlocking {
eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
}
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
- EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
- EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
- EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+ EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+ EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+ EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+ EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
+ EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
+ EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
- EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
- EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
- EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+ EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
+ EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
+ EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
return prefetch_;
}
private:
- Index k_, m_, n_;
- Index kc_, mc_, nc_;
- Index outer_k_, outer_m_, outer_n_;
+ StorageIndex k_, m_, n_;
+ StorageIndex kc_, mc_, nc_;
+ StorageIndex outer_k_, outer_m_, outer_n_;
bool copyA_, copyB_, transposeA_, transposeB_;
size_t num_threads_;
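With the new template signature the result scalar type comes first; every call
site in this patch follows the pattern below (copied from the updated
single threaded evaluator above, sharded by column):

    // Compute cache friendly panel sizes for an (m x k) * (k x n) contraction.
    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                        internal::ShardByCol>
        blocking(k, m, n, /*num_threads=*/1);
    const Index kc = blocking.kc();                   // panel depth
    const Index mc = numext::mini(m, blocking.mc());  // rows per Lhs block
    const Index nc = numext::mini(n, blocking.nc());  // cols per Rhs block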
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
deleted file mode 100644
index a97f043c1..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
+++ /dev/null
@@ -1,116 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
-
-#if defined(EIGEN_USE_MKLDNN)
-// Support for MklDnn sgemm kernel in Tensor contractions:
-//
-// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
-// DataMapper (see TensorContractionInputMapper).
-// 2. Invoke gemm kernel with packed blocks (replacement for default
-// gebp_kernel).
-
-namespace Eigen {
-namespace internal {
-
-template <typename Scalar, typename StorageIndex, typename DataMapper,
- int StorageOrder>
-struct mkldnn_gemm_pack;
-
-// mkl_gemm_pack for ColMajor storage order.
-template <typename Scalar, typename StorageIndex, typename DataMapper>
-struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
- /*StorageOrder*/ ColMajor> {
- typedef typename internal::packet_traits<Scalar>::type Packet;
- typedef typename DataMapper::LinearMapper LinearMapper;
-
- enum { PacketSize = internal::packet_traits<Scalar>::size };
-
- EIGEN_DONT_INLINE
- void operator()(Scalar *block, const DataMapper &data_mapper,
- StorageIndex rows, StorageIndex cols) {
- const StorageIndex unrolled_rows =
- (rows / (4 * PacketSize)) * (4 * PacketSize);
- const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
-
- for (StorageIndex col = 0; col < cols; ++col) {
- LinearMapper lm = data_mapper.getLinearMapper(0, col);
-
- // Give compiler a strong possibility to unroll the loop.
- for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
- for (StorageIndex j = 0; j < 4; ++j) {
- const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
- internal::pstoreu(block + j * PacketSize, p);
- }
- block += 4 * PacketSize;
- }
-
- // Process remaining rows with packets.
- for (StorageIndex i = unrolled_rows; i < vectorized_rows;
- i += PacketSize) {
- const Packet p = lm.template loadPacket<Packet>(i);
- internal::pstoreu(block, p);
- block += PacketSize;
- }
-
- // Finalize with coefficients.
- for (StorageIndex i = vectorized_rows; i < rows; ++i) {
- *block = lm(i);
- ++block;
- }
- }
- }
-};
-
-template <typename Scalar, typename StorageIndex, typename OutputMapper,
- bool ConjugateLhs = false, bool ConjugateRhs = false>
-struct mkldnn_gemm_kernel;
-
-// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
-template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
- bool ConjugateRhs>
-struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
- ConjugateLhs, ConjugateRhs> {
- EIGEN_DONT_INLINE
- void operator()(const OutputMapper &output, const float *blockA,
- const float *blockB, const StorageIndex rows,
- const StorageIndex depth, const StorageIndex cols,
- float alpha) {
- static const int max_index = (std::numeric_limits<int>::max)();
-
- eigen_assert(max_index > rows);
- eigen_assert(max_index > cols);
- eigen_assert(max_index > depth);
- eigen_assert(max_index > output.stride());
-
- const int m = static_cast<int>(rows);
- const int n = static_cast<int>(cols);
- const int k = static_cast<int>(depth);
-
- const char transposeA = ConjugateLhs ? 'Y' : 'N';
- const char transposeB = ConjugateRhs ? 'Y' : 'N';
-
- const int ldA = ConjugateLhs ? k : m;
- const int ldB = ConjugateRhs ? n : k;
- const int ldC = static_cast<int>(output.stride());
-
- const float beta = 1.0;
-
- mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
- &alpha, blockA, &ldA, blockB, &ldB, &beta,
- const_cast<float*>(output.data()), &ldC);
- eigen_assert(st == 0);
- }
-};
-
-} // namespace internal
-} // namespace Eigen
-#endif // EIGEN_USE_MKLDNN
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
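For reference, the kernel deleted above mapped each packed block pair onto a
single accumulating sgemm call. In the common non transposed case it reduces
to the sketch below (same pointer based mkldnn_sgemm signature as in the
removed code; assumes ints m, n, k, ldC, a float alpha, packed column major
blocks blockA/blockB, and an output pointer C):

    const char no_trans = 'N';
    const float beta = 1.0f;  // beta == 1 accumulates into the output block
    // C(m x n) += alpha * A(m x k) * B(k x n), all column major, so the
    // leading dimensions are ldA == m, ldB == k and ldC == output stride.
    mkldnn_status_t st =
        mkldnn_sgemm(&no_trans, &no_trans, &m, &n, &k, &alpha,
                     blockA, &m, blockB, &k, &beta, C, &ldC);
    eigen_assert(st == 0);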
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 0c4d2f0bf..8a464b073 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,177 +15,6 @@
namespace Eigen {
-namespace internal {
-
-// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
-// ColMajor storage order. This property is guaranteed by the
-// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
-// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
-// multiplication for these blocks. Default tensor contraction uses
-// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
-// GeneralBlocPanelKernel.h for details).
-//
-// By specializing contraction kernels we can use other low level libraries to
-// perform matrix multiplication, and still rely on Eigen thread pool evaluator
-// for scaling. Assumption is that custom gemm do not use it's own threading for
-// parallelisation.
-//
-// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
-// multiplication, lhs tensor and rhs tensor respectively.
-//
-// - StorageIndex - index type for the tensor expressions. In practice almost
-// always is Eigen::Index.
-//
-// - OutputMapper provides access to the memory of the output matrix. In
-// practice it's always column major blas_data_mapper (it must be of ResScalar
-// type).
-//
-// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
-// view into the Lhs/Rhs tensor expressions. In practice it's
-// TensorContractionInputMapper, or some specialization of it based on the
-// type of tensor expression (e.g. TensorImagePatchOp has optimized input
-// mapper).
-//
-// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction
-// evaluator.
-template<typename ResScalar, typename LhsScalar, typename RhsScalar,
- typename StorageIndex, typename OutputMapper, typename LhsMapper,
- typename RhsMapper>
-struct TensorContractionKernel {
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
- typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
- typename LhsMapper::SubMapper,
- Traits::mr, Traits::LhsProgress,
- typename Traits::LhsPacket4Packing, ColMajor>
- LhsPacker;
-
- typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
- typename RhsMapper::SubMapper, Traits::nr,
- ColMajor>
- RhsPacker;
-
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
- OutputMapper, Traits::mr, Traits::nr,
- /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
- GebpKernel;
-
- EIGEN_DONT_INLINE
- static void packLhs(LhsScalar* lhsBlock,
- const typename LhsMapper::SubMapper& data_mapper,
- const StorageIndex depth, const StorageIndex rows) {
- LhsPacker()(lhsBlock, data_mapper, depth, rows);
- }
-
- EIGEN_DONT_INLINE
- static void packRhs(RhsScalar* rhsBlock,
- const typename RhsMapper::SubMapper& data_mapper,
- const StorageIndex depth, const StorageIndex cols) {
- RhsPacker()(rhsBlock, data_mapper, depth, cols);
- }
-
- EIGEN_DONT_INLINE
- static void invoke(const OutputMapper& output_mapper,
- const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
- const StorageIndex rows, const StorageIndex depth,
- const StorageIndex cols, const ResScalar alpha) {
- GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
- /*strideA*/ -1, /*strideB*/ -1,
- /*offsetA*/ 0, /*offsetB*/ 0);
- }
-};
-
-// Some tensor contraction kernels might rely on the gemm libraries that are
-// optimized for a specific dimension sizes. By default Eigen picks block
-// sizes to fit the working set in the L1/L2 caches, by specializing we can
-// refine this choice and round up these sizes to work well with underlying gemm
-// library.
-// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
-template<typename ResScalar, typename LhsScalar, typename RhsScalar,
- typename StorageIndex>
-struct TensorContractionKernelBlocking {
- static void refine(const StorageIndex /*m*/,
- const StorageIndex /*n*/,
- const StorageIndex /*k*/,
- StorageIndex* /*bm*/,
- StorageIndex* /*bn*/,
- StorageIndex* /*bk*/) {
- // By default we do nothing and stick to the block sizes picked by Eigen.
- }
-};
-
-#if defined(EIGEN_USE_MKLDNN)
-// If all scalar types in tensor contraction are floats, we can use mkldnn gemm
-// as our low level kernel.
-template<typename StorageIndex, typename OutputMapper, typename LhsMapper,
- typename RhsMapper>
-struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
- LhsMapper, RhsMapper> {
- // For now mkldnn has only mkldnn_sgemm (gemm for floats).
- typedef float Scalar;
-
- typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
-
- typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
- typename LhsMapper::SubMapper, ColMajor>
- LhsPacker;
-
- typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
- typename RhsMapper::SubMapper, ColMajor>
- RhsPacker;
-
- typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
- GemmKernel;
-
- EIGEN_DONT_INLINE
- static void packLhs(Scalar* lhsBlock,
- const typename LhsMapper::SubMapper& data_mapper,
- StorageIndex depth, StorageIndex rows) {
- LhsPacker()(lhsBlock, data_mapper, rows, depth);
- }
-
- EIGEN_DONT_INLINE
- static void packRhs(Scalar* rhsBlock,
- const typename RhsMapper::SubMapper& data_mapper,
- const StorageIndex depth, const StorageIndex cols) {
- RhsPacker()(rhsBlock, data_mapper, depth, cols);
- }
-
- EIGEN_DONT_INLINE
- static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
- const Scalar* rhsBlock, const StorageIndex rows,
- const StorageIndex depth, const StorageIndex cols,
- const Scalar alpha) {
- GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
- }
-};
-
-// For mkldnn_sgemm having the right dimensions (especially for small matrices)
-// is more important than fitting all the working set in L1/L2 caches.
-template<typename StorageIndex>
-struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
- // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
- static const StorageIndex kUnrollM = 48;
- // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the closest
- // number that divides to both of them.
- static const StorageIndex kUnrollN = 24;
-
- static void refine(const StorageIndex m,
- const StorageIndex n,
- const StorageIndex /*k*/,
- StorageIndex* bm,
- StorageIndex* bn,
- StorageIndex* /*bk*/) {
- // TODO(ezhulenev): There is probably a better way to pick block sizes.
- *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
- *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
- // Stick with default bk.
- }
-};
-
-#endif // EIGEN_USE_MKLDNN
-} // namespace internal
-
template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -295,14 +124,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// Again, we don't know number of threads yet, so we use 2.
Index bm, bn, bk;
if (shard_by_col) {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByCol>
blocking(k, m, n, 2);
bm = blocking.mc();
bn = blocking.nc();
bk = blocking.kc();
} else {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByRow>
blocking(k, m, n, 2);
bm = blocking.mc();
@@ -332,24 +161,20 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// Now that we know number of threads, recalculate sharding and blocking.
shard_by_col = shardByCol(m, n, num_threads);
if (shard_by_col) {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByCol>
blocking(k, m, n, num_threads);
bm = blocking.mc();
bn = blocking.nc();
bk = blocking.kc();
} else {
- internal::TensorContractionBlocking<LhsScalar, RhsScalar, Index,
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
internal::ShardByRow>
blocking(k, m, n, num_threads);
bm = blocking.mc();
bn = blocking.nc();
bk = blocking.kc();
}
- // Refine blocking choice to work well with contraction kernel.
- internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
- Index>::refine(m, n, k, &bm,
- &bn, &bk);
// Number of kernels for each dimension.
Index nm0 = divup(m, bm);
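Note that the removed TensorContractionKernelBlocking::refine() step rounded
the Eigen chosen block sizes up to mkldnn's unroll factors: with kUnrollM = 48
and an initial bm = 100, divup(100, 48) * 48 == 144, clamped back to m when
the matrix is smaller. A minimal sketch of that rounding, with
RoundUpToUnroll as an illustrative helper name:

    // Round a block size up to a multiple of the kernel unroll factor,
    // without exceeding the full dimension.
    static Index RoundUpToUnroll(Index dim, Index block, Index unroll) {
      return (std::min)(dim, Eigen::divup(block, unroll) * unroll);
    }
    // RoundUpToUnroll(/*dim=*/1000, /*block=*/100, /*unroll=*/48) == 144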
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 2fd4e1b97..875842272 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -23,17 +23,6 @@ else(XSMM_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
endif(XSMM_FOUND)
-find_package(Mkldnn)
-if(MKLDNN_FOUND)
- add_definitions("-DEIGEN_USE_MKLDNN")
- include_directories(${MKLDNN_INCLUDES})
- link_directories(${MKLDNN_LIBRARIES})
- set(EXTERNAL_LIBS ${EXTERNAL_LIBS} mkldnn)
- ei_add_property(EIGEN_TESTED_BACKENDS "Mkldd, ")
-else(MKLDNN_FOUND)
- ei_add_property(EIGEN_MISSING_BACKENDS "Mkldnn, ")
-endif(MKLDNN_FOUND)
-
find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -191,10 +180,6 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
endif(EIGEN_TEST_SYCL)
- if (MKLDNN_FOUND)
- ei_add_test(cxx11_tensor_contraction_mkldnn)
- endif (MKLDNN_FOUND)
-
ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
diff --git a/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp b/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
deleted file mode 100644
index 8ba0f986d..000000000
--- a/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-#include <Eigen/CXX11/Tensor>
-
-using Eigen::internal::blas_data_mapper;
-using Eigen::internal::mkldnn_gemm_kernel;
-using Eigen::internal::mkldnn_gemm_pack;
-
-template <int NumDims>
-static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
- array<Index, NumDims> dims;
- for (int i = 0; i < NumDims; ++i) {
- dims[i] = internal::random<int>(min_dim, max_dim);
- }
- return dims;
-}
-
-// Packing with mkldnn_gemm_pack is the same as taking a slice of 2 dimensional
-// Tensor.
-template <typename Scalar>
-static void test_mkldnn_gemm_pack() {
- static const int Options = 0 | ColMajor;
-
- typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
- typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
- typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
-
- array<Index, 2> dims = RandomDims<2>(1, 500);
-
- // Create a tensor initialized with random data.
- Tensor2d src(dims);
- src.setRandom();
-
- // Pick a random slice of src tensor.
- array<Index, 2> slice_start = RandomDims<2>(0, 250);
- array<Index, 2> slice_size = RandomDims<2>(100, 500);
- // Make sure that slice start + size do not overflow tensor dims.
- for (int i = 0; i < 2; ++i) {
- slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
- slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
- }
-
- // Prepare tensors for packing and slicing results.
- Tensor2d pack_dst(slice_size[0], slice_size[1]);
- Tensor2d slice_dst(slice_size[0], slice_size[1]);
-
- // Pack memory using mkldnn_gemm_pack.
- DataMapper data_mapper(src.data(), dims[0]);
- MkldnnGemmPack gemm_pack;
- gemm_pack(pack_dst.data(),
- data_mapper.getSubMapper(slice_start[0], slice_start[1]),
- slice_size[0], slice_size[1]);
- // Slice the source tensor.
- slice_dst = src.slice(slice_start, slice_size);
-
- // Verify that dst tensors are equal.
- VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
- slice_dst.dimensions().TotalSize());
- for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
- Scalar packed = pack_dst.coeff(i);
- Scalar sliced = slice_dst.coeff(i);
- VERIFY_IS_EQUAL(packed, sliced);
- }
-}
-template <typename Scalar>
-static void test_mkldnn_gemm_kernel() {
- static const int Options = 0 | ColMajor;
-
- typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
-
- int m = internal::random<int>(1, 100);
- int n = internal::random<int>(1, 100);
- int k = internal::random<int>(1, 100);
-
- Tensor2d lhs(m, k);
- lhs.setRandom();
-
- Tensor2d rhs(k, n);
- rhs.setRandom();
-
- // Compute matmul with mkldnn gemm kernel.
- typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper, ColMajor>
- MkldnnGemmKernel;
-
- Tensor2d mkldnn_result(m, n);
- mkldnn_result.setZero();
-
- OutputMapper output_mapper(mkldnn_result.data(), m);
- MkldnnGemmKernel gemm_kernel;
- gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);
-
- // Compute matmul with Eigen::Matrix.
- typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
- typedef Map<Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> > MatrixMap;
-
- MatrixMap lhs_mat(lhs.data(), m, k);
- MatrixMap rhs_mat(rhs.data(), k, n);
- Matrix matmul_result(m, n);
- matmul_result.setZero();
-
- matmul_result = lhs_mat * rhs_mat;
-
- static const float error_threshold = 1e-4f;
-
- // Verify that results are equal.
- for (Index i = 0; i < m * n; ++i) {
- Scalar gemm = mkldnn_result(i);
- Scalar matmul = matmul_result(i % m, i / m);
- if ((std::abs)(gemm) > error_threshold &&
- (std::abs)(matmul) > error_threshold) {
- if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
- std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
- VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
- }
- }
-}
-
-EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
- CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
- CALL_SUBTEST(test_mkldnn_gemm_pack<double>());
-
- // mkldnn has only sgemm (aka gemm for floats).
- CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
-}