From 9f4988959f1b0394ee027f474f49916543ad2f3c Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 27 Sep 2018 11:49:19 -0700
Subject: Remove explicit mkldnn support and redundant TensorContractionKernelBlocking

---
 cmake/FindMkldnn.cmake                              |  18 --
 unsupported/Eigen/CXX11/Tensor                      |   5 -
 .../Eigen/CXX11/src/Tensor/TensorContraction.h      | 102 ++++++++++--
 .../CXX11/src/Tensor/TensorContractionBlocking.h    |  44 ++---
 .../CXX11/src/Tensor/TensorContractionMkldnn.h      | 116 -------------
 .../CXX11/src/Tensor/TensorContractionThreadPool.h  | 183 +--------------------
 unsupported/test/CMakeLists.txt                     |  15 --
 .../test/cxx11_tensor_contraction_mkldnn.cpp        | 134 ---------------
 8 files changed, 113 insertions(+), 504 deletions(-)
 delete mode 100644 cmake/FindMkldnn.cmake
 delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
 delete mode 100644 unsupported/test/cxx11_tensor_contraction_mkldnn.cpp

diff --git a/cmake/FindMkldnn.cmake b/cmake/FindMkldnn.cmake
deleted file mode 100644
index 6acba9d8c..000000000
--- a/cmake/FindMkldnn.cmake
+++ /dev/null
@@ -1,18 +0,0 @@
-# Intel mkl-dnn support.
-# Link: https://github.com/intel/mkl-dnn
-if (MKLDNN)
-  set(MKLDNN_FIND_QUIETLY TRUE)
-  set(MKLDNN_INCLUDES ${MKLDNN}/include)
-  set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
-endif (MKLDNN)
-find_path(MKLDNN
-  NAMES
-  mkldnn.h
-  PATHS
-  $ENV{MKLDNNDIR}/include
-  ${INCLUDE_INSTALL_DIR}
-  )
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(MKLDNN DEFAULT_MSG
-                                  MKLDNN)
-mark_as_advanced(MKLDNN)
\ No newline at end of file
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 420afe650..14a2d0f0d 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -75,10 +75,6 @@ typedef unsigned __int64 uint64_t;
 #include "libxsmm.h"
 #endif
 
-#if defined(EIGEN_USE_MKLDNN)
-#include "mkldnn.h"
-#endif
-
 #ifdef EIGEN_USE_THREADS
 #include "ThreadPool"
 #endif
@@ -125,7 +121,6 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorArgMax.h"
 #include "src/Tensor/TensorConcatenation.h"
 #include "src/Tensor/TensorContractionMapper.h"
-#include "src/Tensor/TensorContractionMkldnn.h"
 #include "src/Tensor/TensorContractionBlocking.h"
 #include "src/Tensor/TensorContraction.h"
 #include "src/Tensor/TensorContractionThreadPool.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 3b22e43e7..d220f82be 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -136,6 +136,81 @@ struct traits<TensorEvaluator<const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Device_> >
   static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
 };
 
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen contraction
+// evaluator. This also includes full support in TensorContractionThreadPool,
+// assuming that the underlying gemm does not use its own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+// multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+// almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+// practice it's always a column major blas_data_mapper (it must be of
+// ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
+// dimensional view into the Lhs/Rhs tensor expressions. In practice it's
+// TensorContractionInputMapper, or some specialization of it based on the
+// type of tensor expression (e.g. TensorImagePatchOp has an optimized input
+// mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+          typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel {
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+                                  typename LhsMapper::SubMapper, Traits::mr,
+                                  Traits::LhsProgress, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  EIGEN_DONT_INLINE
+  static void packLhs(LhsScalar* lhsBlock,
+                      const typename LhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
+  }
+
+  EIGEN_DONT_INLINE
+  static void packRhs(RhsScalar* rhsBlock,
+                      const typename RhsMapper::SubMapper& data_mapper,
+                      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DONT_INLINE
+  static void invoke(const OutputMapper& output_mapper,
+                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+                     const StorageIndex rows, const StorageIndex depth,
+                     const StorageIndex cols, const ResScalar alpha) {
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ -1, /*strideB*/ -1,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+};
+
 } // end namespace internal
 
 // Tensor contraction params that should enable to get from output matrix
@@ -591,13 +666,9 @@ struct TensorContractionEvaluatorBase
     // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
     this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
 
-    // define mr, nr, and all of my data mapper types
+    // define data mappers for Lhs and Rhs
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-    const Index nr = Traits::nr;
-    const Index mr = Traits::mr;
 
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@@ -619,11 +690,9 @@ struct TensorContractionEvaluatorBase
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
-    // Declare GEBP packing and kernel structs
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
-    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
-    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     // initialize data mappers
     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
                   this->m_left_contracting_strides, this->m_k_strides);
@@ -635,7 +704,7 @@ struct TensorContractionEvaluatorBase
     OutputMapper output(buffer, m);
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+    internal::TensorContractionBlocking<Index, LhsMapper, RhsMapper, internal::ShardByCol> blocking(k, m, n, 1);
     const Index kc = blocking.kc();
     const Index mc = numext::mini(m, blocking.mc());
     const Index nc = numext::mini(n, blocking.nc());
@@ -651,19 +720,22 @@ struct TensorContractionEvaluatorBase
       for (Index k2 = 0; k2 < k; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
         const Index actual_kc = numext::mini(k2 + kc, k) - k2;
-        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+        TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
+                                         actual_kc, actual_mc);
 
         // series of horizontal blocks
         for (Index j2 = 0; j2 < n; j2 += nc) {
           // make sure we don't overshoot right edge of right matrix, then pack block
           const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+          TensorContractionKernel::packRhs(blockB, rhs.getSubMapper(k2, j2),
+                                           actual_kc, actual_nc);
 
           // call gebp (matrix kernel)
           // The parameters here are copied from Eigen's GEMM implementation
           const OutputMapper output_mapper = output.getSubMapper(i2, j2);
-          gebp(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc,
-               Scalar(1), -1, -1, 0, 0);
+          TensorContractionKernel::invoke(output_mapper, blockA, blockB,
+                                          actual_mc, actual_kc, actual_nc,
+                                          Scalar(1));
 
           // We are done with this [i2, j2] output block.
           if (k2 + kc >= k) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index cf281192c..71fd19774 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,7 +21,7 @@ enum {
 
 // Default Blocking Strategy
-template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType = ShardByCol>
+template <typename StorageIndex, typename LhsMapper, typename RhsMapper, int ShardingType = ShardByCol>
 class TensorContractionBlocking {
  public:
@@ -42,7 +42,7 @@ class TensorContractionBlocking {
 #if !defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC
 #endif
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
       kc_(k), mc_(m), nc_(n)
   {
     if (ShardingType == ShardByCol) {
@@ -53,23 +53,23 @@ class TensorContractionBlocking {
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
 
  private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
 };
 
 
 #if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
+template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
 class TensorXsmmContractionBlocking {
  public:
-  TensorXsmmContractionBlocking(Index k, Index m, Index n,
+  TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
                                 size_t max_num_threads = 1, bool transposeA = false,
                                 bool transposeB = false):
       k_(k), m_(m), n_(n), transposeA_(transposeA),
@@ -164,28 +164,28 @@ class TensorXsmmContractionBlocking {
       eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
   }
 
-  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
-  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
-  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
-  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
+  EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
   EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
   EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
   EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
   EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
   EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
-  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
-  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
+  EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
   EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
     return prefetch_;
   }
 
  private:
-  Index k_, m_, n_;
-  Index kc_, mc_, nc_;
-  Index outer_k_, outer_m_, outer_n_;
+  StorageIndex k_, m_, n_;
+  StorageIndex kc_, mc_, nc_;
+  StorageIndex outer_k_, outer_m_, outer_n_;
   bool copyA_, copyB_, transposeA_, transposeB_;
   size_t num_threads_;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
deleted file mode 100644
index a97f043c1..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
+++ /dev/null
@@ -1,116 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
-
-#if defined(EIGEN_USE_MKLDNN)
-// Support for MklDnn sgemm kernel in Tensor contractions:
-//
-// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
-//    DataMapper (see TensorContractionInputMapper).
-// 2. Invoke gemm kernel with packed blocks (replacement for default
-//    gebp_kernel).
-
-namespace Eigen {
-namespace internal {
-
-template <typename Scalar, typename StorageIndex, typename DataMapper,
-          int StorageOrder>
-struct mkldnn_gemm_pack;
-
-// mkl_gemm_pack for ColMajor storage order.
-template <typename Scalar, typename StorageIndex, typename DataMapper>
-struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper, ColMajor> {
-  typedef typename internal::packet_traits<Scalar>::type Packet;
-  typedef typename DataMapper::LinearMapper LinearMapper;
-
-  enum { PacketSize = internal::packet_traits<Scalar>::size };
-
-  EIGEN_DONT_INLINE
-  void operator()(Scalar *block, const DataMapper &data_mapper,
-                  StorageIndex rows, StorageIndex cols) {
-    const StorageIndex unrolled_rows =
-        (rows / (4 * PacketSize)) * (4 * PacketSize);
-    const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
-
-    for (StorageIndex col = 0; col < cols; ++col) {
-      LinearMapper lm = data_mapper.getLinearMapper(0, col);
-
-      // Give compiler a strong possibility to unroll the loop.
-      for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
-        for (StorageIndex j = 0; j < 4; ++j) {
-          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
-          internal::pstoreu(block + j * PacketSize, p);
-        }
-        block += 4 * PacketSize;
-      }
-
-      // Process remaining rows with packets.
-      for (StorageIndex i = unrolled_rows; i < vectorized_rows;
-           i += PacketSize) {
-        const Packet p = lm.template loadPacket<Packet>(i);
-        internal::pstoreu(block, p);
-        block += PacketSize;
-      }
-
-      // Finalize with coefficients.
-      for (StorageIndex i = vectorized_rows; i < rows; ++i) {
-        *block = lm(i);
-        ++block;
-      }
-    }
-  }
-};
-
-template <typename Scalar, typename StorageIndex, typename OutputMapper,
-          bool ConjugateLhs = false, bool ConjugateRhs = false>
-struct mkldnn_gemm_kernel;
-
-// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
-template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs, bool ConjugateRhs>
-struct mkldnn_gemm_kernel<float, StorageIndex, OutputMapper, ConjugateLhs, ConjugateRhs> {
-  EIGEN_DONT_INLINE
-  void operator()(const OutputMapper &output, const float *blockA,
-                  const float *blockB, const StorageIndex rows,
-                  const StorageIndex depth, const StorageIndex cols,
-                  float alpha) {
-    static const int max_index = (std::numeric_limits<int>::max)();
-
-    eigen_assert(max_index > rows);
-    eigen_assert(max_index > cols);
-    eigen_assert(max_index > depth);
-    eigen_assert(max_index > output.stride());
-
-    const int m = static_cast<int>(rows);
-    const int n = static_cast<int>(cols);
-    const int k = static_cast<int>(depth);
-
-    const char transposeA = ConjugateLhs ? 'Y' : 'N';
-    const char transposeB = ConjugateRhs ? 'Y' : 'N';
-
-    const int ldA = ConjugateLhs ? k : m;
-    const int ldB = ConjugateRhs ? n : k;
-    const int ldC = static_cast<int>(output.stride());
-
-    const float beta = 1.0;
-
-    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
-                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
-                                      const_cast<float *>(output.data()), &ldC);
-    eigen_assert(st == 0);
-  }
-};
-
-}  // namespace internal
-}  // namespace Eigen
-#endif  // EIGEN_USE_MKLDNN
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 0c4d2f0bf..8a464b073 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,177 +15,6 @@
 namespace Eigen {
 
-namespace internal {
-
-// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
-// ColMajor storage order. This property is guaranteed by the
-// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
-// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
-// multiplication for these blocks. Default tensor contraction uses
-// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
-// GeneralBlocPanelKernel.h for details).
-//
-// By specializing contraction kernels we can use other low level libraries to
-// perform matrix multiplication, and still rely on Eigen thread pool evaluator
-// for scaling. Assumption is that custom gemm do not use it's own threading for
-// parallelisation.
-//
-// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
-// multiplication, lhs tensor and rhs tensor respectively.
-//
-// - StorageIndex - index type for the tensor expressions. In practice almost
-// always is Eigen::Index.
-//
-// - OutputMapper provides access to the memory of the output matrix. In
-// practice it's always column major blas_data_mapper (it must be of ResScalar
-// type).
-//
-// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
-// view into the Lhs/Rhs tensor expressions. In practice it's
-// TensorContractionInputMapper, or some specialization of it based on the
-// type of tensor expression (e.g. TensorImagePatchOp has optimized input
-// mapper).
-//
-// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction
-// evaluator.
-template <typename ResScalar, typename LhsScalar, typename RhsScalar,
-          typename StorageIndex, typename OutputMapper, typename LhsMapper,
-          typename RhsMapper>
-struct TensorContractionKernel {
-  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
-                                  typename LhsMapper::SubMapper, Traits::mr,
-                                  Traits::LhsProgress, ColMajor>
-      LhsPacker;
-
-  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
-                                  typename RhsMapper::SubMapper, Traits::nr,
-                                  ColMajor>
-      RhsPacker;
-
-  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
-                                OutputMapper, Traits::mr, Traits::nr,
-                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
-      GebpKernel;
-
-  EIGEN_DONT_INLINE
-  static void packLhs(LhsScalar* lhsBlock,
-                      const typename LhsMapper::SubMapper& data_mapper,
-                      const StorageIndex depth, const StorageIndex rows) {
-    LhsPacker()(lhsBlock, data_mapper, depth, rows);
-  }
-
-  EIGEN_DONT_INLINE
-  static void packRhs(RhsScalar* rhsBlock,
-                      const typename RhsMapper::SubMapper& data_mapper,
-                      const StorageIndex depth, const StorageIndex cols) {
-    RhsPacker()(rhsBlock, data_mapper, depth, cols);
-  }
-
-  EIGEN_DONT_INLINE
-  static void invoke(const OutputMapper& output_mapper,
-                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
-                     const StorageIndex rows, const StorageIndex depth,
-                     const StorageIndex cols, const ResScalar alpha) {
-    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
-                 /*strideA*/ -1, /*strideB*/ -1,
-                 /*offsetA*/ 0, /*offsetB*/ 0);
-  }
-};
-
-// Some tensor contraction kernels might rely on the gemm libraries that are
-// optimized for a specific dimension sizes. By default Eigen picks block
-// sizes to fit the working set in the L1/L2 caches, by specializing we can
-// refine this choice and round up these sizes to work well with underlying gemm
-// library.
-// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
-template <typename ResScalar, typename LhsScalar, typename RhsScalar,
-          typename StorageIndex>
-struct TensorContractionKernelBlocking {
-  static void refine(const StorageIndex /*m*/,
-                     const StorageIndex /*n*/,
-                     const StorageIndex /*k*/,
-                     StorageIndex* /*bm*/,
-                     StorageIndex* /*bn*/,
-                     StorageIndex* /*bk*/) {
-    // By default we do nothing and stick to the block sizes picked by Eigen.
-  }
-};
-
-#if defined(EIGEN_USE_MKLDNN)
-// If all scalar types in tensor contraction are floats, we can use mkldnn gemm
-// as our low level kernel.
-template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
-          typename RhsMapper>
-struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
-                               LhsMapper, RhsMapper> {
-  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
-  typedef float Scalar;
-
-  typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
-
-  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex, typename LhsMapper::SubMapper, ColMajor>
-      LhsPacker;
-
-  typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex, typename RhsMapper::SubMapper, ColMajor>
-      RhsPacker;
-
-  typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
-      GemmKernel;
-
-  EIGEN_DONT_INLINE
-  static void packLhs(Scalar* lhsBlock,
-                      const typename LhsMapper::SubMapper& data_mapper,
-                      StorageIndex depth, StorageIndex rows) {
-    LhsPacker()(lhsBlock, data_mapper, rows, depth);
-  }
-
-  EIGEN_DONT_INLINE
-  static void packRhs(Scalar* rhsBlock,
-                      const typename RhsMapper::SubMapper& data_mapper,
-                      const StorageIndex depth, const StorageIndex cols) {
-    RhsPacker()(rhsBlock, data_mapper, depth, cols);
-  }
-
-  EIGEN_DONT_INLINE
-  static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
-                     const Scalar* rhsBlock, const StorageIndex rows,
-                     const StorageIndex depth, const StorageIndex cols,
-                     const Scalar alpha) {
-    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
-  }
-};
-
-// For mkldnn_sgemm having the right dimensions (especially for small matrices)
-// is more important than fitting all the working set in L1/L2 caches.
-template <typename StorageIndex>
-struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
-  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
-  static const StorageIndex kUnrollM = 48;
-  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the closest
-  // number that divides to both of them.
-  static const StorageIndex kUnrollN = 24;
-
-  static void refine(const StorageIndex m,
-                     const StorageIndex n,
-                     const StorageIndex /*k*/,
-                     StorageIndex* bm,
-                     StorageIndex* bn,
-                     StorageIndex* /*bk*/) {
-    // TODO(ezhulenev): There is probably a better way to pick block sizes.
-    *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
-    *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
-    // Stick with default bk.
-  }
-};
-
-#endif  // EIGEN_USE_MKLDNN
-}  // namespace internal
-
 template <typename Indices, typename LeftArgType, typename RightArgType>
 struct TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
     public TensorContractionEvaluatorBase<TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
@@ -295,14 +124,14 @@ struct TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
       Index bm, bn, bk;
       if (shard_by_col) {
-        internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
-                                            internal::ShardByCol>
-            blocking(k, m, n, 2);
+        internal::TensorContractionBlocking<Index, LhsMapper, RhsMapper,
+                                            internal::ShardByCol>
+            blocking(k, m, n, 2);
         bm = blocking.mc();
         bn = blocking.nc();
         bk = blocking.kc();
       } else {
-        internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
-                                            internal::ShardByRow>
-            blocking(k, m, n, 2);
+        internal::TensorContractionBlocking<Index, LhsMapper, RhsMapper,
+                                            internal::ShardByRow>
+            blocking(k, m, n, 2);
         bm = blocking.mc();
@@ -332,24 +161,20 @@ struct TensorEvaluator<TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
       Index bm, bn, bk;
       if (shard_by_col) {
-        internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
-                                            internal::ShardByCol>
-            blocking(k, m, n, num_threads);
+        internal::TensorContractionBlocking<Index, LhsMapper, RhsMapper,
+                                            internal::ShardByCol>
+            blocking(k, m, n, num_threads);
         bm = blocking.mc();
         bn = blocking.nc();
         bk = blocking.kc();
       } else {
-        internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
-                                            internal::ShardByRow>
-            blocking(k, m, n, num_threads);
+        internal::TensorContractionBlocking<Index, LhsMapper, RhsMapper,
+                                            internal::ShardByRow>
+            blocking(k, m, n, num_threads);
         bm = blocking.mc();
         bn = blocking.nc();
         bk = blocking.kc();
       }
-      // Refine blocking choice to work well with contraction kernel.
-      internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar, Index>::refine(m, n, k, &bm,
-                                                                                             &bn, &bk);
 
       // Number of kernels for each dimension.
       Index nm0 = divup(m, bm);
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 2fd4e1b97..875842272 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -23,17 +23,6 @@ else(XSMM_FOUND)
   ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
 endif(XSMM_FOUND)
 
-find_package(Mkldnn)
-if(MKLDNN_FOUND)
-  add_definitions("-DEIGEN_USE_MKLDNN")
-  include_directories(${MKLDNN_INCLUDES})
-  link_directories(${MKLDNN_LIBRARIES})
-  set(EXTERNAL_LIBS ${EXTERNAL_LIBS} mkldnn)
-  ei_add_property(EIGEN_TESTED_BACKENDS "Mkldd, ")
-else(MKLDNN_FOUND)
-  ei_add_property(EIGEN_MISSING_BACKENDS "Mkldnn, ")
-endif(MKLDNN_FOUND)
-
 find_package(GoogleHash)
 if(GOOGLEHASH_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -191,10 +180,6 @@ if(EIGEN_TEST_CXX11)
     ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
   endif(EIGEN_TEST_SYCL)
 
-  if (MKLDNN_FOUND)
-    ei_add_test(cxx11_tensor_contraction_mkldnn)
-  endif (MKLDNN_FOUND)
-
   ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
diff --git a/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp b/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
deleted file mode 100644
index 8ba0f986d..000000000
--- a/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#include "main.h"
-
-#include <Eigen/CXX11/Tensor>
-
-using Eigen::internal::blas_data_mapper;
-using Eigen::internal::mkldnn_gemm_kernel;
-using Eigen::internal::mkldnn_gemm_pack;
-
-template <int NumDims>
-static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
-  array<Index, NumDims> dims;
-  for (int i = 0; i < NumDims; ++i) {
-    dims[i] = internal::random<Index>(min_dim, max_dim);
-  }
-  return dims;
-}
-
-// Packing with mkldnn_gemm_pack is the same as taking a slice of 2 dimensional
-// Tensor.
-template <typename Scalar>
-static void test_mkldnn_gemm_pack() {
-  static const int Options = 0 | ColMajor;
-
-  typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
-  typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
-  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
-
-  array<Index, 2> dims = RandomDims<2>(1, 500);
-
-  // Create a tensor initialized with random data.
-  Tensor2d src(dims);
-  src.setRandom();
-
-  // Pick a random slice of src tensor.
-  array<Index, 2> slice_start = RandomDims<2>(0, 250);
-  array<Index, 2> slice_size = RandomDims<2>(100, 500);
-  // Make sure that slice start + size do not overflow tensor dims.
-  for (int i = 0; i < 2; ++i) {
-    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
-    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
-  }
-
-  // Prepare tensors for packing and slicing results.
-  Tensor2d pack_dst(slice_size[0], slice_size[1]);
-  Tensor2d slice_dst(slice_size[0], slice_size[1]);
-
-  // Pack memory using mkldnn_gemm_pack.
-  DataMapper data_mapper(src.data(), dims[0]);
-  MkldnnGemmPack gemm_pack;
-  gemm_pack(pack_dst.data(),
-            data_mapper.getSubMapper(slice_start[0], slice_start[1]),
-            slice_size[0], slice_size[1]);
-
-  // Slice the source tensor.
-  slice_dst = src.slice(slice_start, slice_size);
-
-  // Verify that dst tensors are equal.
-  VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
-                  slice_dst.dimensions().TotalSize());
-  for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
-    Scalar packed = pack_dst.coeff(i);
-    Scalar sliced = slice_dst.coeff(i);
-    VERIFY_IS_EQUAL(packed, sliced);
-  }
-}
-
-template <typename Scalar>
-static void test_mkldnn_gemm_kernel() {
-  static const int Options = 0 | ColMajor;
-
-  typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
-
-  int m = internal::random<int>(1, 100);
-  int n = internal::random<int>(1, 100);
-  int k = internal::random<int>(1, 100);
-
-  Tensor2d lhs(m, k);
-  lhs.setRandom();
-
-  Tensor2d rhs(k, n);
-  rhs.setRandom();
-
-  // Compute matmul with mkldnn gemm kernel.
-  typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-  typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper>
-      MkldnnGemmKernel;
-
-  Tensor2d mkldnn_result(m, n);
-  mkldnn_result.setZero();
-
-  OutputMapper output_mapper(mkldnn_result.data(), m);
-  MkldnnGemmKernel gemm_kernel;
-  gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);
-
-  // Compute matmul with Eigen::Matrix.
-  typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
-  typedef Map<Matrix> MatrixMap;
-
-  MatrixMap lhs_mat(lhs.data(), m, k);
-  MatrixMap rhs_mat(rhs.data(), k, n);
-
-  Matrix matmul_result(m, n);
-  matmul_result.setZero();
-  matmul_result = lhs_mat * rhs_mat;
-
-  static const float error_threshold = 1e-4f;
-
-  // Verify that results are equal.
-  for (Index i = 0; i < m * n; ++i) {
-    Scalar gemm = mkldnn_result(i);
-    Scalar matmul = matmul_result(i % m, i / m);
-    if ((std::abs)(gemm) > error_threshold &&
-        (std::abs)(matmul) > error_threshold) {
-      if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
-        std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
-      VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
-    }
-  }
-}
-
-EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
-  CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
-  CALL_SUBTEST(test_mkldnn_gemm_pack<double>());
-
-  // mkldnn has only sgemm (aka gemm for floats).
-  CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
-}
-- 
cgit v1.2.3
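
Note: the comment block this patch adds to TensorContraction.h documents TensorContractionKernel as the extension point that replaces the deleted EIGEN_USE_MKLDNN path. The sketch below is not part of the patch; it illustrates, under stated assumptions, how an external column-major sgemm could be plugged in through a partial specialization. `my_blas_sgemm` is a hypothetical stand-in for a real BLAS call (given a naive fallback body here only so the sketch is self-contained and runnable), and the packing simply copies the mapper contents into contiguous column-major blocks, in the spirit of the deleted mkldnn_gemm_pack.

// A minimal sketch, assuming the TensorContractionKernel interface introduced
// by this patch. The specialization must be visible before any contraction
// expression is instantiated.
#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical external kernel: C += alpha * A * B, all buffers column-major.
// A real implementation would forward to an optimized BLAS; this naive loop
// only keeps the sketch self-contained.
static void my_blas_sgemm(int m, int n, int k, float alpha, const float* A,
                          int lda, const float* B, int ldb, float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int p = 0; p < k; ++p)
      for (int i = 0; i < m; ++i)
        C[j * ldc + i] += alpha * A[p * lda + i] * B[j * ldb + p];
}

namespace Eigen {
namespace internal {

// Route all-float contractions through my_blas_sgemm instead of gebp_kernel.
template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
          typename RhsMapper>
struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
                               LhsMapper, RhsMapper> {
  // Copy the (rows x depth) Lhs block into a contiguous column-major buffer.
  static void packLhs(float* lhsBlock,
                      const typename LhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex rows) {
    for (StorageIndex c = 0; c < depth; ++c)
      for (StorageIndex r = 0; r < rows; ++r)
        lhsBlock[c * rows + r] = data_mapper(r, c);
  }

  // Copy the (depth x cols) Rhs block into a contiguous column-major buffer.
  static void packRhs(float* rhsBlock,
                      const typename RhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex cols) {
    for (StorageIndex c = 0; c < cols; ++c)
      for (StorageIndex r = 0; r < depth; ++r)
        rhsBlock[c * depth + r] = data_mapper(r, c);
  }

  // The evaluator zeroes the output buffer once and accumulates over all
  // k-blocks, so the kernel must compute C += alpha * A * B (beta == 1), the
  // same contract the deleted mkldnn_sgemm wrapper followed.
  static void invoke(const OutputMapper& output_mapper, const float* lhsBlock,
                     const float* rhsBlock, const StorageIndex rows,
                     const StorageIndex depth, const StorageIndex cols,
                     const float alpha) {
    my_blas_sgemm(static_cast<int>(rows), static_cast<int>(cols),
                  static_cast<int>(depth), alpha, lhsBlock,
                  static_cast<int>(rows), rhsBlock, static_cast<int>(depth),
                  const_cast<float*>(output_mapper.data()),
                  static_cast<int>(output_mapper.stride()));
  }
};

}  // namespace internal
}  // namespace Eigen

With such a specialization in scope, both the single-threaded evalGemm path rewritten by this patch and TensorContractionThreadPool should dispatch float contractions through the external kernel, while Eigen continues to handle blocking, sharding and evaluation.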