author     Benoit Steiner <benoit.steiner.goog@gmail.com>  2017-01-30 15:25:57 -0800
committer  Benoit Steiner <benoit.steiner.goog@gmail.com>  2017-01-30 15:25:57 -0800
commit     fbc39fd02c642119a2c49e517e1cd6e8fa1a008f (patch)
tree       6cf2142e4b740eb440c577ca08114e4e24912f91 /unsupported
parent     82ce92419e25d8b9902c0f39e2e3b01787bf8687 (diff)
parent     63de19c0004933c7b2b1e418292b9f2ae6c138f4 (diff)
Merge latest changes from upstream
Diffstat (limited to 'unsupported')
 -rw-r--r--  unsupported/CMakeLists.txt                                       |  10
 -rw-r--r--  unsupported/Eigen/CXX11/Tensor                                   |   4
 -rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h           | 286
 -rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h   | 134
 -rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 208
 -rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h                   |   4
 -rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h                  |  12
 -rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h        |  15
 -rw-r--r--  unsupported/test/CMakeLists.txt                                  |  11
 -rw-r--r--  unsupported/test/cxx11_tensor_expr.cpp                           |  46
 10 files changed, 700 insertions(+), 30 deletions(-)
diff --git a/unsupported/CMakeLists.txt b/unsupported/CMakeLists.txt
index 4fef40a86..9a5666105 100644
--- a/unsupported/CMakeLists.txt
+++ b/unsupported/CMakeLists.txt
@@ -1,7 +1,9 @@
add_subdirectory(Eigen)
add_subdirectory(doc EXCLUDE_FROM_ALL)
-if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
- add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
-else()
- add_subdirectory(test EXCLUDE_FROM_ALL)
+if(BUILD_TESTING)
+ if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+ add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+ else()
+ add_subdirectory(test EXCLUDE_FROM_ALL)
+ endif()
endif()
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index f98eb03bd..39916092b 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -71,6 +71,10 @@ typedef unsigned __int64 uint64_t;
#include <time.h>
#endif
+#if defined(EIGEN_USE_LIBXSMM)
+#include "libxsmm.h"
+#endif
+
#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
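
The new include is guarded by EIGEN_USE_LIBXSMM, so the macro has to be defined before the Tensor header is pulled in. A minimal sketch of the expected setup, assuming the translation unit is compiled with AVX enabled and the build provides the libxsmm headers and library:

// Sketch only: EIGEN_USE_LIBXSMM and EIGEN_USE_THREADS are opt-in macros; the
// libxsmm dependency and AVX support are assumptions of this example.
#define EIGEN_USE_LIBXSMM   // must be defined before the Tensor header
#define EIGEN_USE_THREADS   // optional: enables the thread-pool contraction path
#include <unsupported/Eigen/CXX11/Tensor>

int main() { return 0; }
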
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 1b8017349..828db6d8b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,6 +20,70 @@ namespace Eigen {
*
*/
namespace internal {
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+template<typename Scalar, typename Index>
+void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
+ size_t psize = packet_traits<Scalar>::size; // Packet size
+ typedef typename packet_traits<Scalar>::type Packet; // Packet type
+ size_t alignment = psize*sizeof(Scalar); // Needed alignment
+ if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
+ (ldsrc*sizeof(Scalar)) % alignment == 0 &&
+ reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
+ reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
+ // Optimized version using packets
+ size_t num_packets = rows / psize;
+ for (Index col = 0; col < cols; ++col) {
+ EIGEN_ASM_COMMENT("begin pack_simple inner copy");
+ // Unrolled manually 4 times.
+ for (size_t i=0; i < num_packets/4; ++i) {
+ internal::pstore(dst, internal::pload<Packet>(src));
+ dst += psize; src += psize;
+ internal::pstore(dst, internal::pload<Packet>(src));
+ dst += psize; src += psize;
+ internal::pstore(dst, internal::pload<Packet>(src));
+ dst += psize; src += psize;
+ internal::pstore(dst, internal::pload<Packet>(src));
+ dst += psize; src += psize;
+ }
+ for (size_t i=0; i < num_packets%4; ++i) {
+ internal::pstore(dst, internal::pload<Packet>(src));
+ dst += psize; src += psize;
+ }
+ dst += lddst - num_packets*psize;
+ src += ldsrc - num_packets*psize;
+ EIGEN_ASM_COMMENT("end pack_simple inner copy");
+ }
+ } else {
+ // Naive memcpy calls
+ for (Index col = 0; col < cols; ++col) {
+ memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
+ }
+ }
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalar>
+ struct libxsmm_wrapper {
+ libxsmm_wrapper() {}
+ libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {}
+ void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {}
+ void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {}
+ };
+
+ template<>
+ struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
+ libxsmm_wrapper(): libxsmm_mmfunction() {}
+ libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+ libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+ };
+
+ template<>
+ struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
+ libxsmm_wrapper(): libxsmm_mmfunction() {}
+ libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+ libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+ };
+#endif
+
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
@@ -317,6 +381,8 @@ struct TensorContractionEvaluatorBase
}
}
+ EnableXSMMIfPossible(eval_op_indices);
+
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -422,6 +488,13 @@ struct TensorContractionEvaluatorBase
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+ #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+ if (m_can_use_xsmm) {
+ evalGemmXSMM(buffer);
+ return;
+ }
+ #endif
+
// columns in left side, rows in right side
const Index k = this->m_k_size;
@@ -538,7 +611,212 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
- protected:
+protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
+ m_can_use_xsmm = false;
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ if (!std::is_same<Scalar, LhsScalar>::value ||
+ !std::is_same<Scalar, RhsScalar>::value ||
+ !(std::is_same<Scalar, float>::value ||
+ std::is_same<Scalar, double>::value) ||
+ m_leftImpl.data() == NULL ||
+ m_rightImpl.data() == NULL) {
+ return;
+ }
+
+ // Check if we can use faster matmul algorithms. For contraction to be
+ // equivalent to matmul, we need both lhs and rhs contracting dims sequences
+ // to be either a prefix or suffix of all dims. Also, the order of both
+ // must be the same, so we don't have to do reordering.
+ // For example:
+ // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
+ // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
+ // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
+ // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
+ // Depending on whether the contraction dims are a prefix or a suffix of all
+ // dims, we need to pre-transpose matrices in the matmul algorithm:
+ // lhs: prefix -> transpose, suffix -> no transpose
+ // rhs: prefix -> no transpose, suffix -> transpose
+ // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
+ // non-transposed matmul.
+ if (ContractDims == 0) {
+ // This case is totally uninteresting, filter it out to avoid problems
+ // with iterations in further tests.
+ return;
+ }
+
+ // Check if RHS dims list is increasing. LHS already is, so if not, the
+ // order is different and we cannot do matmul.
+ for (int i = 1; i < ContractDims; i++) {
+ if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
+ return;
+ }
+ }
+
+ // Check that there are no holes in either contraction dim sequence.
+ int diff;
+ for (int i = 1; i < ContractDims; i++) {
+ // LHS contract dims are sorted to form an increasing seq.
+ diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
+ if (diff != 1) {
+ return;
+ }
+ // Now we may already assume RHS contract dims seq is increasing too.
+ diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
+ if (diff != 1) {
+ return;
+ }
+ }
+
+ // Check if suffix or prefix.
+ if (eval_op_indices[0].first != 0 &&
+ eval_op_indices[ContractDims-1].first != LDims-1) {
+ return;
+ }
+ if (eval_op_indices[0].second != 0 &&
+ eval_op_indices[ContractDims-1].second != RDims-1) {
+ return;
+ }
+
+ m_can_use_xsmm = true;
+ #endif
+ }
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+ EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ const bool transposeA = !m_lhs_inner_dim_contiguous;
+ const bool transposeB = !m_rhs_inner_dim_contiguous;
+
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+ internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
+ k, m, n, 1, transposeA, transposeB);
+
+ // Outer blocks sizes
+ const Index mc_outer = blocking.outer_m();
+ const Index nc_outer = blocking.outer_n();
+ const Index kc_outer = blocking.outer_k();
+ // Inner blocks sizes
+ const Index mc = blocking.mc();
+ const Index nc = blocking.nc();
+ const Index kc = blocking.kc();
+ // Decisions whether we should copy parts of matrices
+ const bool copyA = blocking.copyA();
+ const bool copyB = blocking.copyB();
+
+ const LhsScalar* leftData = m_leftImpl.data();
+ const RhsScalar* rightData = m_rightImpl.data();
+
+ const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
+ const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
+ const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
+
+ const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
+ // Use bigger stride to avoid hitting same cache line too often.
+ // This consistently gives +~0.5 Gflops.
+ const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
+ kc % 32 == 0 ? kc + 16 : kc
+ );
+
+ // Kernel for the general case (not edges)
+ internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
+
+ LhsScalar* blockA = NULL;
+ RhsScalar* panelB = NULL;
+
+ if (copyA) {
+ blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
+ }
+ if (copyB) {
+ panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
+ }
+
+ const Index kernel_stride_A = copyA ? stride_blockA : stride_A;
+ const Index kernel_stride_B = copyB ? stride_panelB : stride_B;
+ kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
+
+ // Outer blocking
+ for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) {
+ for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
+ for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
+ using numext::mini;
+
+ Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
+
+ // Inner blocking
+ for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) {
+ const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki;
+ const float beta = ki == 0 ? 0 : 1;
+
+ if (copyB) {
+ if (transposeB) {
+ libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
+ } else {
+ internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
+ }
+ }
+
+ for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
+ const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
+
+ const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki :
+ leftData + ki*stride_A + mi;
+
+ if (copyA) {
+ if (transposeA) {
+ libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
+ } else {
+ internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
+ }
+ }
+ const LhsScalar* actual_a = copyA ? blockA : a;
+
+ for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
+ const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
+
+ const RhsScalar* b = rightData + ni*stride_B + ki;
+ Scalar* c = buffer + ni*stride_C + mi;
+ const Scalar* cp = c + nc*stride_C;
+
+ const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
+ const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
+
+ if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
+ // Most used, cached kernel.
+ kernel(actual_a, actual_b, c, actual_a, bp, cp);
+ } else {
+ // Edges - use libxsmm kernel cache.
+ internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (copyA) {
+ this->m_device.deallocate(blockA);
+ }
+ if (copyB) {
+ this->m_device.deallocate(panelB);
+ }
+ }
+#endif
+
// Prevent assignment
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
Dimensions m_dimensions;
@@ -564,6 +842,11 @@ struct TensorContractionEvaluatorBase
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
+
+ /// required for sycl
+ const Indices m_expr_indices;
+
+ bool m_can_use_xsmm;
};
@@ -621,7 +904,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
}
-
};
} // end namespace Eigen
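
To make the prefix/suffix rule in EnableXSMMIfPossible concrete, here is a hedged usage sketch (assuming a build configured as above) of a contraction that qualifies for the matmul fast path, contrasted with one that does not:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> lhs(64, 32), rhs(32, 48);
  lhs.setRandom();
  rhs.setRandom();

  // Contract lhs dim 1 with rhs dim 0: a regular, non-transposed matmul, so the
  // libxsmm path can be taken when both evaluators expose raw data pointers.
  Eigen::array<Eigen::IndexPair<int>, 1> matmul_dims = { Eigen::IndexPair<int>(1, 0) };
  Eigen::Tensor<float, 2> res = lhs.contract(rhs, matmul_dims);

  // By contrast, for 3-D operands a contraction over only the middle dimension,
  // e.g. IndexPair<int>(1, 1), is neither a prefix nor a suffix of the dims and
  // falls back to the generic GEBP-based evalGemm.
  return 0;
}
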
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 5cf7b4f71..d34f9caee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -50,6 +50,140 @@ class TensorContractionBlocking {
};
+
+#if defined(EIGEN_USE_LIBXSMM)
+template <typename LhsScalar, typename RhsScalar, typename Index>
+class TensorXsmmContractionBlocking {
+ public:
+ TensorXsmmContractionBlocking(Index k, Index m, Index n,
+ size_t max_num_threads = 1, bool transposeA = false,
+ bool transposeB = false):
+ k_(k), m_(m), n_(n), transposeA_(transposeA),
+ transposeB_(transposeB), num_threads_(max_num_threads) {
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+ if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+ mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
+ kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
+ nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
+ outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
+ outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
+ outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
+ copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
+ copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
+ outer_m_ = outer_m_ != 0 ? outer_m_ : m;
+ outer_k_ = outer_k_ != 0 ? outer_k_ : k;
+ outer_n_ = outer_n_ != 0 ? outer_n_ : n;
+ }
+#else
+ // Defaults, possibly overridden per platform.
+ copyA_ = true;
+ copyB_ = false;
+
+ // If the matrix is small enough, don't do blocking, just call single xsmm
+ // kernel.
+ if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
+ mc_ = m; kc_ = k; nc_ = n;
+ outer_m_ = m; outer_k_ = k; outer_n_ = n;
+ copyA_ = false; copyB_ = false;
+ } else {
+ int arch = libxsmm_cpuid_x86();
+
+ if (arch == LIBXSMM_X86_AVX512_CORE) {
+ // skylake
+ mc_ = 64; kc_ = 64; nc_ = 24;
+ outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
+ // Hack to use this kernel architecture as the other one has performance
+ // issues (no hardware prefetching).
+ // TODO(nishantpatil): This should be removed if the issues are fixed,
+ // or this one becomes the default.
+ setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
+ } else if (arch == LIBXSMM_X86_AVX2) {
+ // haswell
+ mc_ = 32; kc_ = 192; nc_ = 33;
+ outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
+ } else if (arch == LIBXSMM_X86_AVX) {
+ // ivybridge
+ mc_ = 32; kc_ = 192; nc_ = 48;
+ outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
+ } else {
+ // generic kernel size, usually performing well
+ mc_ = 32; kc_ = 128; nc_ = 32;
+ outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
+ }
+
+ // Only copy if it makes the stride smaller.
+ copyA_ = copyA_ && (m > mc_);
+ copyB_ = copyB_ && (k > kc_);
+ }
+
+ // We need to copy anyway if transposing
+ copyA_ = copyA_ || transposeA;
+ copyB_ = copyB_ || transposeB;
+
+ // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
+ prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
+
+#endif
+
+ mc_ = mc_ > m ? m : mc_;
+ nc_ = nc_ > n ? n : nc_;
+ kc_ = kc_ > k ? k : kc_;
+
+ size_t compute_parallelism = (m / mc_) * (n / nc_);
+ size_t pack_parallelism = 0;
+ if (copyA_) {
+ pack_parallelism += (m / mc_) * (k / kc_);
+ }
+ if (copyB_) {
+ pack_parallelism += (n / nc_) * (k / kc_);
+ }
+ size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
+
+ num_threads_ = numext::mini<size_t>(num_threads_,
+ parallelism / MIN_JOBS_PER_THREAD);
+ num_threads_ = numext::maxi<size_t>(num_threads_, 1);
+
+ // For optimal performance, outer block sizes should be multiples of the kernel
+ // block sizes, or bigger than the matrix size (= no outer blocking).
+ eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
+ eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
+ eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
+ }
+
+ EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+ EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+ EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+ EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
+ EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
+ EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+ EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
+ EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
+ EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
+ EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
+ EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
+ EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
+ EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
+ EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+ EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
+ return prefetch_;
+ }
+
+ private:
+ Index k_, m_, n_;
+ Index kc_, mc_, nc_;
+ Index outer_k_, outer_m_, outer_n_;
+ bool copyA_, copyB_, transposeA_, transposeB_;
+ size_t num_threads_;
+
+ // Threshold for m*k*n to skip blocking and just call libxsmm
+ const double LIBXSMM_THRESHOLD = 80*80*80;
+ // For computing the optimal number of threads: each thread should get at least
+ // this many jobs.
+ const double MIN_JOBS_PER_THREAD = 3;
+ libxsmm_gemm_prefetch_type prefetch_;
+};
+#endif // EIGEN_USE_LIBXSMM
+
} // end namespace internal
} // end namespace Eigen
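
A worked illustration, not part of the patch, of the parallelism heuristic above for one hypothetical shape; the numbers only show how num_threads_ is derived from the block counts:

#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
  // Hypothetical sizes: m = n = 1024 with the Skylake kernel blocks mc = 64,
  // nc = 24, no packing, and 16 threads offered by the device.
  const std::size_t m = 1024, n = 1024, mc = 64, nc = 24;
  const std::size_t max_threads = 16;
  const double kMinJobsPerThread = 3;  // mirrors MIN_JOBS_PER_THREAD

  const std::size_t compute_parallelism = (m / mc) * (n / nc);  // 16 * 42 = 672
  std::size_t num_threads = std::min<std::size_t>(
      max_threads,
      static_cast<std::size_t>(compute_parallelism / kMinJobsPerThread));
  num_threads = std::max<std::size_t>(num_threads, 1);

  assert(num_threads == 16);  // plenty of blocks, so all offered threads are used
  return 0;
}
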
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index ee16cde9b..d30cc96ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -116,6 +116,28 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
+ const Index m = this->m_i_size;
+ const Index n = this->m_j_size;
+ const Index k = this->m_k_size;
+ if (m == 0 || n == 0 || k == 0) return;
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+ if (this->m_can_use_xsmm) {
+ bool transposeA = !this->m_lhs_inner_dim_contiguous;
+ bool transposeB = !this->m_rhs_inner_dim_contiguous;
+ internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
+ blocking(k, m, n, this->m_device.numThreads(), transposeA,
+ transposeB);
+
+ if (blocking.num_threads() == 1) {
+ this->evalGemmXSMM(buffer);
+ } else {
+ ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
+ }
+ return;
+ }
+#endif
+
typedef
typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
LhsScalar;
@@ -147,10 +169,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Traits::mr, Traits::nr, false, false>
GebpKernel;
- const Index m = this->m_i_size;
- const Index n = this->m_j_size;
- const Index k = this->m_k_size;
- if (m == 0 || n == 0 || k == 0) return;
+
// Compute a set of algorithm parameters:
// - kernel block sizes (bm, bn, bk)
@@ -1044,6 +1063,187 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
rhsCost.dropMemoryCost();
return cost + lhsCost + rhsCost;
}
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+ template<int Alignment>
+ class ContextXsmm {
+ public:
+ ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
+ const internal::TensorXsmmContractionBlocking<LhsScalar,
+ RhsScalar, Index>& blocking):
+ device(self->m_device),
+ m(m), k(k), n(n),
+ stride_a(blocking.transposeA() ? k : m),
+ stride_b(blocking.transposeB() ? n : k),
+ stride_c(m),
+ bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
+ blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
+ blocks_n(blocking.blocks_n()),
+ copyA(blocking.copyA()), copyB(blocking.copyB()),
+ transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
+ num_threads(blocking.num_threads()),
+ buffer(buffer),
+ leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
+ workers_done(blocking.num_threads()),
+
+ packingA_jobs(0), packingB_jobs(0), compute_jobs(0),
+ packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {}
+
+ void worker() {
+ // Pack
+
+ if (copyA) {
+ while (true) {
+ uint32_t mk = packingA_jobs++;
+ Index mi = mk / blocks_k;
+ Index ki = mk % blocks_k;
+ if (mi >= blocks_m) break;
+
+ LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
+ if (transposeA) {
+ const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
+ libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
+ actual_bm(mi), stride_a, bm);
+ } else {
+ const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
+ internal::pack_simple<LhsScalar, Index>(blockA, current_a,
+ actual_bk(ki), actual_bm(mi), bm, stride_a);
+ }
+ packingA_done.at(mi)++;
+ }
+ }
+
+ if (copyB) {
+ while (true) {
+ uint32_t nk = packingB_jobs++;
+ Index ni = nk / blocks_k;
+ Index ki = nk % blocks_k;
+ if (ni >= blocks_n) break;
+
+ RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
+ if (transposeB) {
+ const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
+ (ni*bn);
+ libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
+ actual_bk(ki), stride_b, bk);
+ } else {
+ const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
+ (ki*bk);
+ internal::pack_simple<RhsScalar, Index>(blockB, current_b,
+ actual_bn(ni), actual_bk(ki), bk, stride_b);
+ }
+ packingB_done.at(ni)++;
+ }
+ }
+
+ // Compute
+
+ while (true) {
+ uint32_t mn = compute_jobs++;
+ Index mi = mn / blocks_n;
+ Index ni = mn % blocks_n;
+ if (mi >= blocks_m) break;
+
+ // Wait for mi, ni packings to be done. This is more fine-grained than
+ // waiting for all workers to finish packing.
+ while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
+ (copyB && (packingB_done.at(ni) < blocks_k)))
+ {}
+
+ for (Index ki=0; ki < blocks_k; ++ki) {
+ const LhsScalar * current_a = copyA ?
+ blocksA + (bk*bm) * (mi*blocks_k+ki) :
+ leftData + (bk*ki)*stride_a + (bm*mi);
+ const RhsScalar * current_b = copyB ?
+ blocksB + (bk*bn) * (ni*blocks_k+ki) :
+ rightData + (ni*bn)*stride_b + (bk*ki);
+
+ Index current_stride_a = copyA ? bm : stride_a;
+ Index current_stride_b = copyB ? bk : stride_b;
+
+ // Memory may not be zeroed, overwrite instead of adding in first
+ // iteration.
+ float beta = ki == 0 ? 0 : 1;
+
+ Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
+ internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
+ 0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
+ current_stride_a, current_stride_b, stride_c, 1, beta, 0)
+ (current_a, current_b, current_c);
+ }
+ }
+
+ workers_done.Notify();
+ }
+
+ void run() {
+ // Parallelization strategy.
+ //
+ // First pack A into blocks (sharding by m, k) and B (sharding by n,k),
+ // then shard by m, n.
+ //
+ // Do not use advanced ThreadPool queuing; just run a single long-running
+ // function in each thread.
+ if (copyA) {
+ blocksA = static_cast<LhsScalar*>(device.allocate(
+ (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
+ }
+ if (copyB) {
+ blocksB = static_cast<RhsScalar*>(device.allocate(
+ (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
+ }
+
+ for (Index i = 0; i < num_threads; ++i) {
+ device.enqueueNoNotification([=]() { worker(); });
+ }
+
+ workers_done.Wait();
+
+ if (copyA) {
+ device.deallocate(blocksA);
+ }
+ if (copyB) {
+ device.deallocate(blocksB);
+ }
+ }
+
+ private:
+ // real block size for block index in [0, ..., blocks - 1].
+ Index actual_bm(Index mi) const {
+ return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
+ }
+ Index actual_bk(Index ki) const {
+ return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
+ }
+ Index actual_bn(Index ni) const {
+ return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
+ }
+
+ const Device& device;
+ Index m, k, n;
+ Index stride_a, stride_b, stride_c;
+ Index bm, bk, bn; // Block sizes.
+ Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension.
+ bool copyA, copyB, transposeA, transposeB;
+ Index num_threads;
+ Scalar *buffer;
+ const LhsScalar *leftData;
+ const RhsScalar *rightData;
+
+ LhsScalar *blocksA;
+ RhsScalar *blocksB;
+ // barrier for joining all threads after all done.
+ Barrier workers_done;
+ // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q)
+ std::atomic<uint32_t> packingA_jobs;
+ std::atomic<uint32_t> packingB_jobs;
+ std::atomic<uint32_t> compute_jobs;
+ // already packed blocks for each mi-panel in A and ni-panel in B.
+ std::vector<std::atomic<uint8_t>> packingA_done;
+ std::vector<std::atomic<uint8_t>> packingB_done;
+ };
+#endif
+
};
} // end namespace Eigen
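
The parallelization comment in ContextXsmm::run() boils down to three atomic job counters that flatten pairs of block indices into a single sequence each worker drains. Below is a stripped-down sketch of just that claiming scheme, with plain std::thread standing in for the device thread pool and a counter increment standing in for the libxsmm kernel call:

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

int main() {
  const int blocks_m = 4, blocks_n = 3;
  std::atomic<uint32_t> compute_jobs(0);
  std::vector<int> visits(blocks_m * blocks_n, 0);

  auto worker = [&]() {
    while (true) {
      uint32_t mn = compute_jobs++;   // claim the next flattened job index
      int mi = mn / blocks_n;
      int ni = mn % blocks_n;
      if (mi >= blocks_m) break;      // all (mi, ni) pairs have been handed out
      visits[mi * blocks_n + ni]++;   // stand-in for the kernel call on block (mi, ni)
    }
  };

  std::vector<std::thread> pool;
  for (int i = 0; i < 4; ++i) pool.emplace_back(worker);
  for (auto& t : pool) t.join();

  // Every block was computed exactly once.
  for (int v : visits) { if (v != 1) return 1; }
  return 0;
}
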
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 08eb5595a..f060191ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -253,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// get data into line_buf
const Index stride = m_strides[dim];
if (stride == 1) {
- memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+ m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -271,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// write back
if (FFTDir == FFT_FORWARD && stride == 1) {
- memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+ m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 30d3ebcff..03169d591 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -200,19 +200,15 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
return a[I];
}
-template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N> > {
static const size_t value = N;
};
-template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N>& > {
static const size_t value = N;
};
-template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N> > {
static const size_t value = N;
};
-template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
static const size_t value = N;
};
@@ -251,14 +247,6 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_
#undef STD_GET_ARR_HACK
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
- static const size_t value = N;
-};
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<std::array<T,N> > {
- static const size_t value = N;
-};
} // end namespace internal
} // end namespace Eigen
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 4bb1852b6..9ad2b9cc8 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -204,7 +204,8 @@ struct matrix_exp_computeUV
template <typename MatrixType>
struct matrix_exp_computeUV<MatrixType, float>
{
- static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
+ template <typename ArgType>
+ static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
{
using std::frexp;
using std::pow;
@@ -227,7 +228,8 @@ struct matrix_exp_computeUV<MatrixType, float>
template <typename MatrixType>
struct matrix_exp_computeUV<MatrixType, double>
{
- static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
+ template <typename ArgType>
+ static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
{
using std::frexp;
using std::pow;
@@ -254,7 +256,8 @@ struct matrix_exp_computeUV<MatrixType, double>
template <typename MatrixType>
struct matrix_exp_computeUV<MatrixType, long double>
{
- static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
+ template <typename ArgType>
+ static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
{
#if LDBL_MANT_DIG == 53 // double precision
matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
@@ -351,11 +354,11 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result)
return;
}
#endif
- MatrixType U, V;
+ typename MatrixType::PlainObject U, V;
int squarings;
matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
- MatrixType numer = U + V;
- MatrixType denom = -U + V;
+ typename MatrixType::PlainObject numer = U + V;
+ typename MatrixType::PlainObject denom = -U + V;
result = denom.partialPivLu().solve(numer);
for (int i=0; i<squarings; i++)
result *= result; // undo scaling by repeated squaring
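
The templated run() together with the PlainObject temporaries means matrix_exp_compute can now be fed an expression rather than a materialized matrix. A hedged usage sketch of what this enables at the API surface:

#include <unsupported/Eigen/MatrixFunctions>
#include <iostream>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
  // exp() of an expression: the argument forwarded into matrix_exp_computeUV is
  // the expression type itself, while U, V, numer and denom are plain matrices.
  Eigen::MatrixXd E = (A + A.transpose()).exp();
  std::cout << E << std::endl;
  return 0;
}
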
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index cf07b033d..9fa479f52 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -21,6 +21,17 @@ include_directories(../../test ../../unsupported ../../Eigen
find_package (Threads)
+find_package(Xsmm)
+if(XSMM_FOUND)
+ add_definitions("-DEIGEN_USE_LIBXSMM")
+ include_directories(${XSMM_INCLUDES})
+ link_directories(${XSMM_LIBRARIES})
+ set(EXTERNAL_LIBS ${EXTERNAL_LIBS} xsmm)
+ ei_add_property(EIGEN_TESTED_BACKENDS "Xsmm, ")
+else(XSMM_FOUND)
+ ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
+endif(XSMM_FOUND)
+
find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 77e24cb67..129b4e659 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -300,6 +300,51 @@ static void test_select()
}
}
+template <typename Scalar>
+void test_minmax_nan_propagation_templ() {
+ for (int size = 1; size < 17; ++size) {
+ const Scalar kNan = std::numeric_limits<Scalar>::quiet_NaN();
+ Tensor<Scalar, 1> vec_nan(size);
+ Tensor<Scalar, 1> vec_zero(size);
+ Tensor<Scalar, 1> vec_res(size);
+ vec_nan.setConstant(kNan);
+ vec_zero.setZero();
+ vec_res.setZero();
+
+ // Test that we propagate NaNs in the tensor when applying the
+ // cwiseMax(scalar) operator, which is used for the Relu operator.
+ vec_res = vec_nan.cwiseMax(Scalar(0));
+ for (int i = 0; i < size; ++i) {
+ VERIFY((numext::isnan)(vec_res(i)));
+ }
+
+ // Test that NaNs do not propagate if we reverse the arguments.
+ vec_res = vec_zero.cwiseMax(kNan);
+ for (int i = 0; i < size; ++i) {
+ VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
+ }
+
+ // Test that we propagate NaNs in the tensor when applying the
+ // cwiseMin(scalar) operator.
+ vec_res.setZero();
+ vec_res = vec_nan.cwiseMin(Scalar(0));
+ for (int i = 0; i < size; ++i) {
+ VERIFY((numext::isnan)(vec_res(i)));
+ }
+
+ // Test that NaNs do not propagate if we reverse the arguments.
+ vec_res = vec_zero.cwiseMin(kNan);
+ for (int i = 0; i < size; ++i) {
+ VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
+ }
+ }
+}
+
+static void test_minmax_nan_propagation()
+{
+ test_minmax_nan_propagation_templ<float>();
+ test_minmax_nan_propagation_templ<double>();
+}
void test_cxx11_tensor_expr()
{
@@ -311,4 +356,5 @@ void test_cxx11_tensor_expr()
CALL_SUBTEST(test_functors());
CALL_SUBTEST(test_type_casting());
CALL_SUBTEST(test_select());
+ CALL_SUBTEST(test_minmax_nan_propagation());
}