author    Eugene Zhulenev <ezhulenev@google.com>  2018-09-26 11:08:47 -0700
committer Eugene Zhulenev <ezhulenev@google.com>  2018-09-26 11:08:47 -0700
commit    71cd3fbd6a03991977fd3faf82109bf27b701d2d (patch)
tree      bb017d89f9901d9d6d550eb20f9c5af3d0dd2aae
parent    0a3356f4ece30cd486b616eb1da9128aa4f245be (diff)
Support multiple contraction kernel types in TensorContractionThreadPool
-rw-r--r--  cmake/FindMkldnn.cmake                                            |  18
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                                    |   5
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h     | 116
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 212
-rw-r--r--  unsupported/test/CMakeLists.txt                                   |  12
-rw-r--r--  unsupported/test/cxx11_tensor_contraction_mkldnn.cpp              | 141
6 files changed, 484 insertions(+), 20 deletions(-)
diff --git a/cmake/FindMkldnn.cmake b/cmake/FindMkldnn.cmake
new file mode 100644
index 000000000..6acba9d8c
--- /dev/null
+++ b/cmake/FindMkldnn.cmake
@@ -0,0 +1,18 @@
+# Intel mkl-dnn support.
+# Link: https://github.com/intel/mkl-dnn
+if (MKLDNN)
+ set(MKLDNN_FIND_QUIETLY TRUE)
+ set(MKLDNN_INCLUDES ${MKLDNN}/include)
+ set(MKLDNN_LIBRARIES ${MKLDNN}/lib)
+endif (MKLDNN)
+find_path(MKLDNN
+ NAMES
+ mkldnn.h
+ PATHS
+ $ENV{MKLDNNDIR}/include
+ ${INCLUDE_INSTALL_DIR}
+ )
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MKLDNN DEFAULT_MSG
+ MKLDNN)
+mark_as_advanced(MKLDNN)
\ No newline at end of file
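
With this module, the mkl-dnn location can be passed either through the MKLDNN CMake variable (e.g. cmake -DMKLDNN=<mkl-dnn install prefix>, so that ${MKLDNN}/include and ${MKLDNN}/lib exist) or through the MKLDNNDIR environment variable searched by find_path above.
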
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 14a2d0f0d..420afe650 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -75,6 +75,10 @@ typedef unsigned __int64 uint64_t;
#include "libxsmm.h"
#endif
+#if defined(EIGEN_USE_MKLDNN)
+#include "mkldnn.h"
+#endif
+
#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
@@ -121,6 +125,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionMkldnn.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
new file mode 100644
index 000000000..a97f043c1
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMkldnn.h
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
+
+#if defined(EIGEN_USE_MKLDNN)
+// Support for MklDnn sgemm kernel in Tensor contractions:
+//
+// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using
+// DataMapper (see TensorContractionInputMapper).
+// 2. Invoke gemm kernel with packed blocks (replacement for default
+// gebp_kernel).
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, typename StorageIndex, typename DataMapper,
+ int StorageOrder>
+struct mkldnn_gemm_pack;
+
+// mkldnn_gemm_pack for ColMajor storage order.
+template <typename Scalar, typename StorageIndex, typename DataMapper>
+struct mkldnn_gemm_pack<Scalar, StorageIndex, DataMapper,
+ /*StorageOrder*/ ColMajor> {
+ typedef typename internal::packet_traits<Scalar>::type Packet;
+ typedef typename DataMapper::LinearMapper LinearMapper;
+
+ enum { PacketSize = internal::packet_traits<Scalar>::size };
+
+ EIGEN_DONT_INLINE
+ void operator()(Scalar *block, const DataMapper &data_mapper,
+ StorageIndex rows, StorageIndex cols) {
+ const StorageIndex unrolled_rows =
+ (rows / (4 * PacketSize)) * (4 * PacketSize);
+ const StorageIndex vectorized_rows = (rows / PacketSize) * PacketSize;
+
+ for (StorageIndex col = 0; col < cols; ++col) {
+ LinearMapper lm = data_mapper.getLinearMapper(0, col);
+
+ // Give the compiler a good opportunity to unroll the loop.
+ for (StorageIndex i = 0; i < unrolled_rows; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; ++j) {
+ const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
+ internal::pstoreu(block + j * PacketSize, p);
+ }
+ block += 4 * PacketSize;
+ }
+
+ // Process remaining rows with packets.
+ for (StorageIndex i = unrolled_rows; i < vectorized_rows;
+ i += PacketSize) {
+ const Packet p = lm.template loadPacket<Packet>(i);
+ internal::pstoreu(block, p);
+ block += PacketSize;
+ }
+
+ // Finalize with coefficients.
+ for (StorageIndex i = vectorized_rows; i < rows; ++i) {
+ *block = lm(i);
+ ++block;
+ }
+ }
+ }
+};
+
+template <typename Scalar, typename StorageIndex, typename OutputMapper,
+ bool ConjugateLhs = false, bool ConjugateRhs = false>
+struct mkldnn_gemm_kernel;
+
+// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
+template <typename StorageIndex, typename OutputMapper, bool ConjugateLhs,
+ bool ConjugateRhs>
+struct mkldnn_gemm_kernel</*Scalar*/ float, StorageIndex, OutputMapper,
+ ConjugateLhs, ConjugateRhs> {
+ EIGEN_DONT_INLINE
+ void operator()(const OutputMapper &output, const float *blockA,
+ const float *blockB, const StorageIndex rows,
+ const StorageIndex depth, const StorageIndex cols,
+ float alpha) {
+ static const int max_index = (std::numeric_limits<int>::max)();
+
+ eigen_assert(max_index > rows);
+ eigen_assert(max_index > cols);
+ eigen_assert(max_index > depth);
+ eigen_assert(max_index > output.stride());
+
+ const int m = static_cast<int>(rows);
+ const int n = static_cast<int>(cols);
+ const int k = static_cast<int>(depth);
+
+ const char transposeA = ConjugateLhs ? 'Y' : 'N';
+ const char transposeB = ConjugateRhs ? 'Y' : 'N';
+
+ const int ldA = ConjugateLhs ? k : m;
+ const int ldB = ConjugateRhs ? n : k;
+ const int ldC = static_cast<int>(output.stride());
+
+ const float beta = 1.0;
+
+ mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
+ &alpha, blockA, &ldA, blockB, &ldB, &beta,
+ const_cast<float*>(output.data()), &ldC);
+ eigen_assert(st == 0);
+ }
+};
+
+} // namespace internal
+} // namespace Eigen
+#endif // EIGEN_USE_MKLDNN
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MKLDNN_H
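
For reference, a minimal standalone sketch of how the two helpers above fit together (it mirrors the unit test added at the end of this commit; the function name and buffer handling are illustrative only, and it assumes the translation unit is compiled with EIGEN_USE_MKLDNN and linked against mkl-dnn):

    // Pack column-major blocks with mkldnn_gemm_pack and multiply them with
    // mkldnn_gemm_kernel. The include path matches the unsupported test setup.
    #include <vector>
    #include <Eigen/CXX11/Tensor>

    using Eigen::Index;
    using Eigen::internal::blas_data_mapper;
    using Eigen::internal::mkldnn_gemm_kernel;
    using Eigen::internal::mkldnn_gemm_pack;

    typedef blas_data_mapper<float, Index, Eigen::ColMajor> DataMapper;

    // Computes out += lhs * rhs for column-major matrices: lhs is (m x k),
    // rhs is (k x n), out is (m x n). mkldnn_gemm_kernel hardcodes beta = 1,
    // so it accumulates into out.
    void sgemm_block_example(float* lhs, float* rhs, float* out,
                             Index m, Index k, Index n) {
      // Pack lhs and rhs into contiguous column-major blocks.
      std::vector<float> lhs_block(m * k), rhs_block(k * n);
      mkldnn_gemm_pack<float, Index, DataMapper, Eigen::ColMajor> pack;
      pack(lhs_block.data(), DataMapper(lhs, m), m, k);
      pack(rhs_block.data(), DataMapper(rhs, k), k, n);

      // Multiply the packed blocks into the output mapper.
      DataMapper output(out, m);
      mkldnn_gemm_kernel<float, Index, DataMapper> gemm;
      gemm(output, lhs_block.data(), rhs_block.data(), m, k, n, /*alpha=*/1.0f);
    }
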
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 0980854b4..0c4d2f0bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,6 +15,177 @@
namespace Eigen {
+namespace internal {
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. The default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlockPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on the Eigen thread pool
+// evaluator for scaling. The assumption is that a custom gemm kernel does not
+// use its own threading for parallelization.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+// multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice it is
+// almost always Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+// practice it's always a column major blas_data_mapper (it must be of
+// ResScalar type).
+//
+// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a
+// two-dimensional view into the Lhs/Rhs tensor expressions. In practice it's
+// TensorContractionInputMapper, or some specialization of it based on the
+// type of tensor expression (e.g. TensorImagePatchOp has an optimized input
+// mapper).
+//
+// TODO(ezhulenev): Use TensorContractionKernel in default tensor contraction
+// evaluator.
+template<typename ResScalar, typename LhsScalar, typename RhsScalar,
+ typename StorageIndex, typename OutputMapper, typename LhsMapper,
+ typename RhsMapper>
+struct TensorContractionKernel {
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex,
+ typename LhsMapper::SubMapper,
+ Traits::mr, Traits::LhsProgress,
+ typename Traits::LhsPacket4Packing, ColMajor>
+ LhsPacker;
+
+ typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+ typename RhsMapper::SubMapper, Traits::nr,
+ ColMajor>
+ RhsPacker;
+
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+ OutputMapper, Traits::mr, Traits::nr,
+ /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+ GebpKernel;
+
+ EIGEN_DONT_INLINE
+ static void packLhs(LhsScalar* lhsBlock,
+ const typename LhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex rows) {
+ LhsPacker()(lhsBlock, data_mapper, depth, rows);
+ }
+
+ EIGEN_DONT_INLINE
+ static void packRhs(RhsScalar* rhsBlock,
+ const typename RhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex cols) {
+ RhsPacker()(rhsBlock, data_mapper, depth, cols);
+ }
+
+ EIGEN_DONT_INLINE
+ static void invoke(const OutputMapper& output_mapper,
+ const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
+ const StorageIndex rows, const StorageIndex depth,
+ const StorageIndex cols, const ResScalar alpha) {
+ GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+ /*strideA*/ -1, /*strideB*/ -1,
+ /*offsetA*/ 0, /*offsetB*/ 0);
+ }
+};
+
+// Some tensor contraction kernels might rely on gemm libraries that are
+// optimized for specific dimension sizes. By default Eigen picks block sizes
+// that fit the working set in the L1/L2 caches; by specializing we can refine
+// this choice and round these sizes up to work well with the underlying gemm
+// library.
+// TODO(ezhulenev): Move it to TensorContractionBlocking, or keep separate?
+template<typename ResScalar, typename LhsScalar, typename RhsScalar,
+ typename StorageIndex>
+struct TensorContractionKernelBlocking {
+ static void refine(const StorageIndex /*m*/,
+ const StorageIndex /*n*/,
+ const StorageIndex /*k*/,
+ StorageIndex* /*bm*/,
+ StorageIndex* /*bn*/,
+ StorageIndex* /*bk*/) {
+ // By default we do nothing and stick to the block sizes picked by Eigen.
+ }
+};
+
+#if defined(EIGEN_USE_MKLDNN)
+// If all scalar types in the tensor contraction are floats, we can use the
+// mkldnn gemm as our low level kernel.
+template<typename StorageIndex, typename OutputMapper, typename LhsMapper,
+ typename RhsMapper>
+struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
+ LhsMapper, RhsMapper> {
+ // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+ typedef float Scalar;
+
+ typedef typename internal::gebp_traits<Scalar, Scalar> Traits;
+
+ typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+ typename LhsMapper::SubMapper, ColMajor>
+ LhsPacker;
+
+ typedef internal::mkldnn_gemm_pack<Scalar, StorageIndex,
+ typename RhsMapper::SubMapper, ColMajor>
+ RhsPacker;
+
+ typedef internal::mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>
+ GemmKernel;
+
+ EIGEN_DONT_INLINE
+ static void packLhs(Scalar* lhsBlock,
+ const typename LhsMapper::SubMapper& data_mapper,
+ StorageIndex depth, StorageIndex rows) {
+ LhsPacker()(lhsBlock, data_mapper, rows, depth);
+ }
+
+ EIGEN_DONT_INLINE
+ static void packRhs(Scalar* rhsBlock,
+ const typename RhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex cols) {
+ RhsPacker()(rhsBlock, data_mapper, depth, cols);
+ }
+
+ EIGEN_DONT_INLINE
+ static void invoke(const OutputMapper& output_mapper, const Scalar* lhsBlock,
+ const Scalar* rhsBlock, const StorageIndex rows,
+ const StorageIndex depth, const StorageIndex cols,
+ const Scalar alpha) {
+ GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+ }
+};
+
+// For mkldnn_sgemm, having the right dimensions (especially for small matrices)
+// is more important than fitting the whole working set in the L1/L2 caches.
+template<typename StorageIndex>
+struct TensorContractionKernelBlocking<float, float, float, StorageIndex> {
+ // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48. We pick the largest.
+ static const StorageIndex kUnrollM = 48;
+ // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8. We pick the smallest
+ // number that both of them divide (their least common multiple).
+ static const StorageIndex kUnrollN = 24;
+
+ static void refine(const StorageIndex m,
+ const StorageIndex n,
+ const StorageIndex /*k*/,
+ StorageIndex* bm,
+ StorageIndex* bn,
+ StorageIndex* /*bk*/) {
+ // TODO(ezhulenev): There is probably a better way to pick block sizes.
+ *bm = (std::min)(m, Eigen::divup(*bm, kUnrollM) * kUnrollM);
+ *bn = (std::min)(n, Eigen::divup(*bn, kUnrollN) * kUnrollN);
+ // Stick with default bk.
+ }
+};
+
+#endif // EIGEN_USE_MKLDNN
+} // namespace internal
+
template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
@@ -175,6 +346,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
bn = blocking.nc();
bk = blocking.kc();
}
+ // Refine blocking choice to work well with contraction kernel.
+ internal::TensorContractionKernelBlocking<Scalar, LhsScalar, RhsScalar,
+ Index>::refine(m, n, k, &bm,
+ &bn, &bk);
// Number of kernels for each dimension.
Index nm0 = divup(m, bm);
@@ -242,17 +417,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
contract_t, internal::packet_traits<RhsScalar>::size,
rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
RhsMapper;
- typedef internal::gemm_pack_lhs<LhsScalar, Index,
- typename LhsMapper::SubMapper, Traits::mr,
- Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
- LhsPacker;
- typedef internal::gemm_pack_rhs<
- RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
- RhsPacker;
+
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
- Traits::mr, Traits::nr, false, false>
- GebpKernel;
+
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;
Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn,
Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk,
@@ -434,8 +604,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
void pack_lhs(Index m, Index k) {
const Index mend = m * gm_ + gm(m);
for (Index m1 = m * gm_; m1 < mend; m1++)
- LhsPacker()(packed_lhs_[k % (P - 1)][m1],
- lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+ TensorContractionKernel::packLhs(packed_lhs_[k % (P - 1)][m1],
+ lhs_.getSubMapper(m1 * bm_, k * bk_),
+ bk(k), bm(m1));
if (!parallel_pack_ && shard_by_col_) {
signal_packing(k);
@@ -458,8 +629,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// deadlocks.
memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
}
- RhsPacker()(packed_rhs_[k % (P - 1)][n1],
- rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+ TensorContractionKernel::packRhs(packed_rhs_[k % (P - 1)][n1],
+ rhs_.getSubMapper(k * bk_, n1 * bn_),
+ bk(k), bn(n1));
}
if (parallel_pack_ || shard_by_col_) {
@@ -480,9 +652,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index n1 = n * gn_; n1 < nend; n1++) {
for (Index m1 = m * gm_; m1 < mend; m1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
@@ -495,9 +667,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
for (Index m1 = m * gm_; m1 < mend; m1++)
for (Index n1 = n * gn_; n1 < nend; n1++) {
const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- GebpKernel()(output_mapper, packed_lhs_[k % (P - 1)][m1],
- packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
- Scalar(1), -1, -1, 0, 0);
+ TensorContractionKernel::invoke(
+ output_mapper, packed_lhs_[k % (P - 1)][m1],
+ packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), Scalar(1));
// We are done with the last task for the [m1, n1] block.
if (k + 1 == nk_) {
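
To make the TensorContractionKernel contract above concrete, here is a skeleton of what a custom specialization has to provide. This is a sketch only: the double instantiation and the empty bodies are placeholders rather than part of this commit, and the external gemm library it would call is hypothetical. The thread pool evaluator only ever calls the three static entry points shown.

    #define EIGEN_USE_THREADS
    #include <Eigen/CXX11/Tensor>

    namespace Eigen {
    namespace internal {

    // Hypothetical kernel for double contractions backed by an external gemm
    // library. Bodies are comments describing the expected semantics.
    template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
              typename RhsMapper>
    struct TensorContractionKernel<double, double, double, StorageIndex,
                                   OutputMapper, LhsMapper, RhsMapper> {
      static void packLhs(double* lhsBlock,
                          const typename LhsMapper::SubMapper& data_mapper,
                          StorageIndex depth, StorageIndex rows) {
        // Copy a (rows x depth) column-major block of the lhs expression into
        // lhsBlock, in whatever layout the external gemm expects.
      }

      static void packRhs(double* rhsBlock,
                          const typename RhsMapper::SubMapper& data_mapper,
                          StorageIndex depth, StorageIndex cols) {
        // Copy a (depth x cols) column-major block of the rhs expression.
      }

      static void invoke(const OutputMapper& output_mapper,
                         const double* lhsBlock, const double* rhsBlock,
                         StorageIndex rows, StorageIndex depth,
                         StorageIndex cols, double alpha) {
        // output += alpha * lhsBlock * rhsBlock, e.g. by calling the external
        // gemm with ldC = output_mapper.stride().
      }
    };

    }  // namespace internal
    }  // namespace Eigen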
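
As a worked example of the block-size refinement above (numbers chosen purely for illustration): if Eigen's cache-based heuristic picks bm = 100 and bn = 25 for a contraction with m = n = 1000, the mkldnn specialization rounds them up to multiples of its unroll factors, bm = divup(100, 48) * 48 = 144 and bn = divup(25, 24) * 24 = 48, each still clamped to m and n respectively, while bk keeps the value chosen by the default heuristic.
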
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 875842272..25ae1a8d2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -23,6 +23,17 @@ else(XSMM_FOUND)
ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
endif(XSMM_FOUND)
+find_package(Mkldnn)
+if(MKLDNN_FOUND)
+ add_definitions("-DEIGEN_USE_MKLDNN")
+ include_directories(${MKLDNN_INCLUDES})
+ link_directories(${MKLDNN_LIBRARIES})
+ set(EXTERNAL_LIBS ${EXTERNAL_LIBS} mkldnn)
+ ei_add_property(EIGEN_TESTED_BACKENDS "Mkldnn, ")
+else(MKLDNN_FOUND)
+ ei_add_property(EIGEN_MISSING_BACKENDS "Mkldnn, ")
+endif(MKLDNN_FOUND)
+
find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -190,6 +201,7 @@ if(EIGEN_TEST_CXX11)
ei_add_test(cxx11_tensor_index_list)
ei_add_test(cxx11_tensor_mixed_indices)
ei_add_test(cxx11_tensor_contraction)
+ ei_add_test(cxx11_tensor_contraction_mkldnn)
ei_add_test(cxx11_tensor_convolution)
ei_add_test(cxx11_tensor_expr)
ei_add_test(cxx11_tensor_fixed_size)
diff --git a/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp b/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
new file mode 100644
index 000000000..5a905c0cf
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contraction_mkldnn.cpp
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+// Nothing to test here if we do not have mkldnn enabled.
+#if defined(EIGEN_USE_MKLDNN)
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::ColMajor;
+using Eigen::Tensor;
+using Eigen::Index;
+using Eigen::internal::blas_data_mapper;
+using Eigen::internal::mkldnn_gemm_kernel;
+using Eigen::internal::mkldnn_gemm_pack;
+
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+ array<Index, NumDims> dims;
+ for (int i = 0; i < NumDims; ++i) {
+ dims[i] = internal::random<int>(min_dim, max_dim);
+ }
+ return dims;
+}
+
+// Packing with mkldnn_gemm_pack is the same as taking a slice of a
+// 2-dimensional Tensor.
+template <typename Scalar>
+static void test_mkldnn_gemm_pack() {
+ static const int Options = 0 | ColMajor;
+
+ typedef blas_data_mapper<Scalar, Index, ColMajor> DataMapper;
+ typedef mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor> MkldnnGemmPack;
+ typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
+
+ array<Index, 2> dims = RandomDims<2>(1, 500);
+
+ // Create a tensor initialized with random data.
+ Tensor2d src(dims);
+ src.setRandom();
+
+ // Pick a random slice of src tensor.
+ array<Index, 2> slice_start = RandomDims<2>(0, 250);
+ array<Index, 2> slice_size = RandomDims<2>(100, 500);
+ // Make sure that slice start + size do not overflow tensor dims.
+ for (int i = 0; i < 2; ++i) {
+ slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+ slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+ }
+
+ // Prepare tensors for packing and slicing results.
+ Tensor2d pack_dst(slice_size[0], slice_size[1]);
+ Tensor2d slice_dst(slice_size[0], slice_size[1]);
+
+ // Pack memory using mkldnn_gemm_pack.
+ DataMapper data_mapper(src.data(), dims[0]);
+ MkldnnGemmPack gemm_pack;
+ gemm_pack(pack_dst.data(),
+ data_mapper.getSubMapper(slice_start[0], slice_start[1]),
+ slice_size[0], slice_size[1]);
+ // Slice the source tensor.
+ slice_dst = src.slice(slice_start, slice_size);
+
+ // Verify that dst tensors are equal.
+ VERIFY_IS_EQUAL(pack_dst.dimensions().TotalSize(),
+ slice_dst.dimensions().TotalSize());
+ for (Index i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
+ Scalar packed = pack_dst.coeff(i);
+ Scalar sliced = slice_dst.coeff(i);
+ VERIFY_IS_EQUAL(packed, sliced);
+ }
+}
+template <typename Scalar>
+static void test_mkldnn_gemm_kernel() {
+ static const int Options = 0 | ColMajor;
+
+ typedef Tensor<Scalar, 2, Options, Index> Tensor2d;
+
+ int m = internal::random<int>(1, 100);
+ int n = internal::random<int>(1, 100);
+ int k = internal::random<int>(1, 100);
+
+ Tensor2d lhs(m, k);
+ lhs.setRandom();
+
+ Tensor2d rhs(k, n);
+ rhs.setRandom();
+
+ // Compute matmul with mkldnn gemm kernel.
+ typedef blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+ typedef mkldnn_gemm_kernel<Scalar, Index, OutputMapper, ColMajor>
+ MkldnnGemmKernel;
+
+ Tensor2d mkldnn_result(m, n);
+ mkldnn_result.setZero();
+
+ OutputMapper output_mapper(mkldnn_result.data(), m);
+ MkldnnGemmKernel gemm_kernel;
+ gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha*/ 1.0);
+
+ // Compute matmul with Eigen::Matrix.
+ typedef Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> Matrix;
+ typedef Map<Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor> > MatrixMap;
+
+ MatrixMap lhs_mat(lhs.data(), m, k);
+ MatrixMap rhs_mat(rhs.data(), k, n);
+ Matrix matmul_result(m, n);
+ matmul_result.setZero();
+
+ matmul_result = lhs_mat * rhs_mat;
+
+ static const float error_threshold = 1e-4f;
+
+ // Verify that results are equal.
+ for (Index i = 0; i < m * n; ++i) {
+ Scalar gemm = mkldnn_result(i);
+ Scalar matmul = matmul_result(i % m, i / m);
+ if ((std::abs)(gemm) > error_threshold &&
+ (std::abs)(matmul) > error_threshold) {
+ if (!Eigen::internal::isApprox(gemm, matmul, error_threshold))
+ std::cout << "gemm=" << gemm << " matmul=" << matmul << std::endl;
+ VERIFY(Eigen::internal::isApprox(gemm, matmul, error_threshold));
+ }
+ }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {
+ CALL_SUBTEST(test_mkldnn_gemm_pack<float>());
+ CALL_SUBTEST(test_mkldnn_gemm_kernel<float>());
+}
+#else
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction_mkldnn) {}
+#endif // EIGEN_USE_MKLDNN