Diffstat (limited to 'unsupported/Eigen')
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                                    |   4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h            | 286
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h    | 134
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  | 208
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h                    |   4
-rw-r--r--  unsupported/Eigen/CXX11/src/util/EmulateArray.h                   |  12
-rw-r--r--  unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h         |  15
7 files changed, 637 insertions, 26 deletions
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index f98eb03bd..39916092b 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -71,6 +71,10 @@ typedef unsigned __int64 uint64_t;
 #include <time.h>
 #endif
 
+#if defined(EIGEN_USE_LIBXSMM)
+#include "libxsmm.h"
+#endif
+
 #ifdef EIGEN_USE_THREADS
 #include "ThreadPool"
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 1b8017349..828db6d8b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,6 +20,70 @@ namespace Eigen {
  *
  */
 namespace internal {
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+template<typename Scalar, typename Index>
+void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
+  size_t psize = packet_traits<Scalar>::size;           // Packet size
+  typedef typename packet_traits<Scalar>::type Packet;  // Packet type
+  size_t alignment = psize*sizeof(Scalar);              // Needed alignment
+  if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
+      (ldsrc*sizeof(Scalar)) % alignment == 0 &&
+      reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
+      reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
+    // Optimized version using packets
+    size_t num_packets = rows / psize;
+    for (Index col = 0; col < cols; ++col) {
+      EIGEN_ASM_COMMENT("begin pack_simple inner copy");
+      // Unrolled manually 4 times.
+      for (size_t i=0; i < num_packets/4; ++i) {
+        internal::pstore(dst, internal::pload<Packet>(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload<Packet>(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload<Packet>(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload<Packet>(src));
+        dst += psize; src += psize;
+      }
+      for (size_t i=0; i < num_packets%4; ++i) {
+        internal::pstore(dst, internal::pload<Packet>(src));
+        dst += psize; src += psize;
+      }
+      dst += lddst - num_packets*psize;
+      src += ldsrc - num_packets*psize;
+      EIGEN_ASM_COMMENT("end pack_simple inner copy");
+    }
+  } else {
+    // Naive memcpy calls
+    for (Index col = 0; col < cols; ++col) {
+      memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
+    }
+  }
+}
+
+template<typename LhsScalar, typename RhsScalar, typename Scalar>
+struct libxsmm_wrapper {
+  libxsmm_wrapper() {}
+  libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {}
+  void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {}
+  void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {}
+};
+
+template<>
+struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
+  libxsmm_wrapper(): libxsmm_mmfunction() {}
+  libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+    libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+};
+
+template<>
+struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
+  libxsmm_wrapper(): libxsmm_mmfunction() {}
+  libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+    libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+};
+#endif
+
 template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
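For reference while reading the hunk above: pack_simple is a strided block copy into a contiguous panel. Below is a minimal standalone restatement of its memcpy fallback (the packet path only adds alignment checks and 4x manual unrolling); the function name and sizes here are illustrative, not Eigen API.

```cpp
// Copy a rows x cols sub-panel of a column-major matrix (leading dimension
// ldsrc) into a packed buffer with leading dimension lddst.
#include <cstring>
#include <vector>

template <typename Scalar>
void pack_block(Scalar* dst, const Scalar* src, int cols, int rows,
                int lddst, int ldsrc) {
  for (int col = 0; col < cols; ++col)
    std::memcpy(dst + col * lddst, src + col * ldsrc, rows * sizeof(Scalar));
}

int main() {
  const int ld = 100;                // leading dimension of the source matrix
  std::vector<float> big(ld * 50, 1.f);
  std::vector<float> block(32 * 8);  // packed 32x8 panel, lddst == rows
  // Pack the 32x8 sub-panel starting at row 4, column 2.
  pack_block(block.data(), big.data() + 2 * ld + 4, /*cols=*/8, /*rows=*/32,
             /*lddst=*/32, /*ldsrc=*/ld);
}
```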
@@ -317,6 +381,8 @@ struct TensorContractionEvaluatorBase
       }
     }
 
+    EnableXSMMIfPossible(eval_op_indices);
+
     // If the layout is RowMajor, we need to reverse the m_dimensions
     if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
       for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -422,6 +488,13 @@ struct TensorContractionEvaluatorBase
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+    #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+    if (m_can_use_xsmm) {
+      evalGemmXSMM(buffer);
+      return;
+    }
+    #endif
+
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
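The hunks above route evalGemm to the libxsmm path only when the contraction was recognized as a plain matrix product at construction time. The canonical recognizable case, written against the public Tensor API, is a 2-D contraction over the pair (1, 0):

```cpp
// A 2-D tensor contraction over IndexPair(1, 0) is exactly a
// non-transposed matrix product; this is the shape the fast path targets.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 32), b(32, 48);
  a.setRandom();
  b.setRandom();
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{ Eigen::IndexPair<int>(1, 0) }};
  Eigen::Tensor<float, 2> c = a.contract(b, dims);  // 64 x 48 result
}
```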
@@ -538,7 +611,212 @@ struct TensorContractionEvaluatorBase
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
 
-  protected:
+protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
+    m_can_use_xsmm = false;
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+    if (!std::is_same<Scalar, LhsScalar>::value ||
+        !std::is_same<Scalar, RhsScalar>::value ||
+        !(std::is_same<Scalar, float>::value ||
+          std::is_same<Scalar, double>::value) ||
+        m_leftImpl.data() == NULL ||
+        m_rightImpl.data() == NULL) {
+      return;
+    }
+
+    // Check if we can use faster matmul algorithms. For contraction to be
+    // equivalent to matmul, we need both lhs and rhs contracting dims sequences
+    // to be either a prefix or suffix of all dims. Also, the order of both
+    // must be the same, so we don't have to do reordering.
+    // For example:
+    // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
+    // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
+    // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
+    // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
+    // Depending on whether the contraction dims are a prefix or a suffix of all
+    // dims, we need to pre-transpose matrices in the matmul algorithm:
+    // lhs: prefix -> transpose, suffix -> no transpose
+    // rhs: prefix -> no transpose, suffix -> transpose
+    // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
+    // non-transposed matmul.
+    if (ContractDims == 0) {
+      // This case is totally uninteresting, filter it out to avoid problems
+      // with iterations in further tests.
+      return;
+    }
+
+    // Check if RHS dims list is increasing. LHS already is, so if not, the
+    // order is different and we cannot do matmul.
+    for (int i = 1; i < ContractDims; i++) {
+      if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
+        return;
+      }
+    }
+
+    // Check if no holes.
+    int diff;
+    for (int i = 1; i < ContractDims; i++) {
+      // LHS contract dims are sorted to form an increasing seq.
+      diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
+      if (diff != 1) {
+        return;
+      }
+      // Now we may already assume RHS contract dims seq is increasing too.
+      diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
+      if (diff != 1) {
+        return;
+      }
+    }
+
+    // Check if suffix or prefix.
+    if (eval_op_indices[0].first != 0 &&
+        eval_op_indices[ContractDims-1].first != LDims-1) {
+      return;
+    }
+    if (eval_op_indices[0].second != 0 &&
+        eval_op_indices[ContractDims-1].second != RDims-1) {
+      return;
+    }
+
+    m_can_use_xsmm = true;
+#endif
+  }
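The eligibility rule above can be restated as a standalone predicate. This is a sketch of the same logic for a hypothetical pre-sorted list of contraction index pairs (not Eigen API): contracted dims must be consecutive ("no holes"), ordered identically on both sides, and form a prefix or suffix of each operand's dimensions.

```cpp
#include <utility>
#include <vector>

bool contraction_is_matmul(const std::vector<std::pair<int,int> >& pairs,
                           int lhs_dims, int rhs_dims) {
  const int c = static_cast<int>(pairs.size());
  if (c == 0) return false;  // degenerate, not handled by the fast path
  for (int i = 1; i < c; ++i) {
    if (pairs[i].first  != pairs[i-1].first  + 1) return false;  // hole in lhs
    if (pairs[i].second != pairs[i-1].second + 1) return false;  // hole or reorder in rhs
  }
  // Must be a prefix or a suffix of the dims on each side.
  if (pairs[0].first  != 0 && pairs[c-1].first  != lhs_dims - 1) return false;
  if (pairs[0].second != 0 && pairs[c-1].second != rhs_dims - 1) return false;
  return true;
}
```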
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+  EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    const bool transposeA = !m_lhs_inner_dim_contiguous;
+    const bool transposeB = !m_rhs_inner_dim_contiguous;
+
+    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+    internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
+        k, m, n, 1, transposeA, transposeB);
+
+    // Outer block sizes
+    const Index mc_outer = blocking.outer_m();
+    const Index nc_outer = blocking.outer_n();
+    const Index kc_outer = blocking.outer_k();
+    // Inner block sizes
+    const Index mc = blocking.mc();
+    const Index nc = blocking.nc();
+    const Index kc = blocking.kc();
+    // Decisions on whether to copy parts of the matrices
+    const bool copyA = blocking.copyA();
+    const bool copyB = blocking.copyB();
+
+    const LhsScalar* leftData = m_leftImpl.data();
+    const RhsScalar* rightData = m_rightImpl.data();
+
+    const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
+    const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
+    const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
+
+    const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
+    // Use a bigger stride to avoid hitting the same cache line too often.
+    // This consistently gives +~0.5 Gflops.
+    const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
+        kc % 32 == 0 ? kc + 16 : kc
+    );
+
+    // Kernel for the general case (not edges)
+    internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
+
+    LhsScalar* blockA = NULL;
+    RhsScalar* panelB = NULL;
+
+    if (copyA) {
+      blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
+    }
+    if (copyB) {
+      panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
+    }
+
+    const Index kernel_stride_A = copyA ? stride_blockA : stride_A;
+    const Index kernel_stride_B = copyB ? stride_panelB : stride_B;
+    kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
+
+    // Outer blocking
+    for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) {
+      for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
+        for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
+          using numext::mini;
+
+          Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
+
+          // Inner blocking
+          for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) {
+            const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki;
+            const float beta = ki == 0 ? 0 : 1;
+
+            if (copyB) {
+              if (transposeB) {
+                libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
+              } else {
+                internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
+              }
+            }
+
+            for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
+              const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
+
+              const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki :
+                                                leftData + ki*stride_A + mi;
+
+              if (copyA) {
+                if (transposeA) {
+                  libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
+                } else {
+                  internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
+                }
+              }
+              const LhsScalar* actual_a = copyA ? blockA : a;
+
+              for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
+                const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
+
+                const RhsScalar* b = rightData + ni*stride_B + ki;
+                Scalar* c = buffer + ni*stride_C + mi;
+                const Scalar* cp = c + nc*stride_C;
+
+                const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
+                const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
+
+                if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
+                  // Most used, cached kernel.
+                  kernel(actual_a, actual_b, c, actual_a, bp, cp);
+                } else {
+                  // Edges - use libxsmm kernel cache.
+                  internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    if (copyA) {
+      this->m_device.deallocate(blockA);
+    }
+    if (copyB) {
+      this->m_device.deallocate(panelB);
+    }
+  }
+#endif
+
   // Prevent assignment
   TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
 
   Dimensions m_dimensions;
@@ -564,6 +842,11 @@ struct TensorContractionEvaluatorBase
   TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
   const Device& m_device;
   Scalar* m_result;
+
+  /// required for sycl
+  const Indices m_expr_indices;
+
+  bool m_can_use_xsmm;
 };
 
@@ -621,7 +904,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, Device>, Device>
     this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
   }
-
 };
 
 } // end namespace Eigen
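The triple-nested blocking in evalGemmXSMM is easier to see with the libxsmm calls and packing stripped out. Below is a scalar sketch of the same loop structure and the first-k-block overwrite convention (beta = 0 on the first k block, accumulate afterwards, since the output buffer may be uninitialized). All sizes are illustrative; the real code dispatches each block to a libxsmm kernel.

```cpp
#include <algorithm>

// Column-major A (m x k), B (k x n), C (m x n), blocked by (mc, nc, kc).
void blocked_gemm(const float* A, const float* B, float* C,
                  int m, int n, int k, int mc, int nc, int kc) {
  for (int ki = 0; ki < k; ki += kc)
    for (int mi = 0; mi < m; mi += mc)
      for (int ni = 0; ni < n; ni += nc)
        for (int j = ni; j < std::min(ni + nc, n); ++j)
          for (int i = mi; i < std::min(mi + mc, m); ++i) {
            // ki == 0: overwrite (beta = 0); later k blocks: accumulate.
            float acc = (ki == 0) ? 0.f : C[j * m + i];
            for (int p = ki; p < std::min(ki + kc, k); ++p)
              acc += A[p * m + i] * B[j * k + p];
            C[j * m + i] = acc;
          }
}
```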
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 5cf7b4f71..d34f9caee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -50,6 +50,140 @@ class TensorContractionBlocking {
 };
 
+
+#if defined(EIGEN_USE_LIBXSMM)
+template <typename LhsScalar, typename RhsScalar, typename Index>
+class TensorXsmmContractionBlocking {
+ public:
+  TensorXsmmContractionBlocking(Index k, Index m, Index n,
+      size_t max_num_threads = 1, bool transposeA = false,
+      bool transposeB = false):
+    k_(k), m_(m), n_(n), transposeA_(transposeA),
+    transposeB_(transposeB), num_threads_(max_num_threads) {
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+    if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+      mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
+      kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
+      nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
+      outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
+      outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
+      outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
+      copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
+      copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
+      outer_m_ = outer_m_ != 0 ? outer_m_ : m;
+      outer_k_ = outer_k_ != 0 ? outer_k_ : k;
+      outer_n_ = outer_n_ != 0 ? outer_n_ : n;
+    }
+#else
+    // Defaults, possibly overridden per-platform.
+    copyA_ = true;
+    copyB_ = false;
+
+    // If the matrix is small enough, don't do blocking; just call a single
+    // xsmm kernel.
+    if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
+      mc_ = m; kc_ = k; nc_ = n;
+      outer_m_ = m; outer_k_ = k; outer_n_ = n;
+      copyA_ = false; copyB_ = false;
+    } else {
+      int arch = libxsmm_cpuid_x86();
+
+      if (arch == LIBXSMM_X86_AVX512_CORE) {
+        // skylake
+        mc_ = 64; kc_ = 64; nc_ = 24;
+        outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
+        // Hack to use this kernel architecture as the other one has performance
+        // issues (no hardware prefetching).
+        // TODO(nishantpatil): This should be removed if the issues are fixed,
+        // or this one becomes the default.
+        setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
+      } else if (arch == LIBXSMM_X86_AVX2) {
+        // haswell
+        mc_ = 32; kc_ = 192; nc_ = 33;
+        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
+      } else if (arch == LIBXSMM_X86_AVX) {
+        // ivybridge
+        mc_ = 32; kc_ = 192; nc_ = 48;
+        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
+      } else {
+        // generic kernel size, usually performing well
+        mc_ = 32; kc_ = 128; nc_ = 32;
+        outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
+      }
+
+      // Only copy if it makes the stride smaller.
+      copyA_ = copyA_ && (m > mc_);
+      copyB_ = copyB_ && (k > kc_);
+    }
+
+    // We need to copy anyway if transposing
+    copyA_ = copyA_ || transposeA;
+    copyB_ = copyB_ || transposeB;
+
+    // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
+    prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
+#endif
+
+    mc_ = mc_ > m ? m : mc_;
+    nc_ = nc_ > n ? n : nc_;
+    kc_ = kc_ > k ? k : kc_;
+
+    size_t compute_parallelism = (m / mc_) * (n / nc_);
+    size_t pack_parallelism = 0;
+    if (copyA_) {
+      pack_parallelism += (m / mc_) * (k / kc_);
+    }
+    if (copyB_) {
+      pack_parallelism += (n / nc_) * (k / kc_);
+    }
+    size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
+
+    num_threads_ = numext::mini<size_t>(num_threads_,
+                                        parallelism / MIN_JOBS_PER_THREAD);
+    num_threads_ = numext::maxi<size_t>(num_threads_, 1);
+
+    // For optimal performance, outer block sizes should be multiples of kernel
+    // sizes, or bigger than the matrix size (= no outer blocking).
+    eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
+    eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
+    eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
+  }
+
+  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
+  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
+  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+  EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
+  EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
+  EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
+  EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
+  EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
+  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
+  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
+  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+  EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
+    return prefetch_;
+  }
+
+ private:
+  Index k_, m_, n_;
+  Index kc_, mc_, nc_;
+  Index outer_k_, outer_m_, outer_n_;
+  bool copyA_, copyB_, transposeA_, transposeB_;
+  size_t num_threads_;
+
+  // Threshold for m*k*n to skip blocking and just call libxsmm
+  const double LIBXSMM_THRESHOLD = 80*80*80;
+  // For computing the optimal number of threads - so that each thread gets at
+  // least that many jobs.
+  const double MIN_JOBS_PER_THREAD = 3;
+  libxsmm_gemm_prefetch_type prefetch_;
+};
+#endif // EIGEN_USE_LIBXSMM
+
 } // end namespace internal
 } // end namespace Eigen
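The thread cap computed at the end of the constructor can be restated standalone: count independent jobs (compute blocks, plus packing blocks when copying), then give each thread at least MIN_JOBS_PER_THREAD of them. A sketch under the same heuristic; the function name is ours, not Eigen's.

```cpp
#include <algorithm>
#include <cstddef>

std::size_t cap_threads(std::size_t max_threads, std::size_t blocks_m,
                        std::size_t blocks_n, std::size_t blocks_k,
                        bool copyA, bool copyB) {
  const std::size_t min_jobs_per_thread = 3;  // same constant as above
  const std::size_t compute = blocks_m * blocks_n;
  const std::size_t pack = (copyA ? blocks_m * blocks_k : 0) +
                           (copyB ? blocks_n * blocks_k : 0);
  const std::size_t jobs = std::max(compute, pack);
  return std::max<std::size_t>(
      1, std::min(max_threads, jobs / min_jobs_per_thread));
}
```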
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index ee16cde9b..d30cc96ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -116,6 +116,28 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, Device>, Device>
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
+    const Index m = this->m_i_size;
+    const Index n = this->m_j_size;
+    const Index k = this->m_k_size;
+    if (m == 0 || n == 0 || k == 0) return;
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+    if (this->m_can_use_xsmm) {
+      bool transposeA = !this->m_lhs_inner_dim_contiguous;
+      bool transposeB = !this->m_rhs_inner_dim_contiguous;
+      internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
+          blocking(k, m, n, this->m_device.numThreads(), transposeA,
+                   transposeB);
+
+      if (blocking.num_threads() == 1) {
+        this->evalGemmXSMM(buffer);
+      } else {
+        ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
+      }
+      return;
+    }
+#endif
+
     typedef
         typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
             LhsScalar;
@@ -147,10 +169,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, Device>, Device>
                                             Traits::mr, Traits::nr, false, false>
         GebpKernel;
 
-    const Index m = this->m_i_size;
-    const Index n = this->m_j_size;
-    const Index k = this->m_k_size;
-    if (m == 0 || n == 0 || k == 0) return;
+
 
     // Compute a set of algorithm parameters:
     // - kernel block sizes (bm, bn, bk)
@@ -1044,6 +1063,187 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, Device>, Device>
     rhsCost.dropMemoryCost();
     return cost + lhsCost + rhsCost;
   }
+
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+  template<int Alignment>
+  class ContextXsmm {
+   public:
+    ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
+                const internal::TensorXsmmContractionBlocking<LhsScalar,
+                    RhsScalar, Index>& blocking):
+        device(self->m_device),
+        m(m), k(k), n(n),
+        stride_a(blocking.transposeA() ? k : m),
+        stride_b(blocking.transposeB() ? n : k),
+        stride_c(m),
+        bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
+        blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
+        blocks_n(blocking.blocks_n()),
+        copyA(blocking.copyA()), copyB(blocking.copyB()),
+        transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
+        num_threads(blocking.num_threads()),
+        buffer(buffer),
+        leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
+        workers_done(blocking.num_threads()),
+
+        packingA_jobs(0), packingB_jobs(0), compute_jobs(0),
+        packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {}
+
+    void worker() {
+      // Pack
+
+      if (copyA) {
+        while (true) {
+          uint32_t mk = packingA_jobs++;
+          Index mi = mk / blocks_k;
+          Index ki = mk % blocks_k;
+          if (mi >= blocks_m) break;
+
+          LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
+          if (transposeA) {
+            const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
+            libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
+                           actual_bm(mi), stride_a, bm);
+          } else {
+            const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
+            internal::pack_simple<LhsScalar, Index>(blockA, current_a,
+                actual_bk(ki), actual_bm(mi), bm, stride_a);
+          }
+          packingA_done.at(mi)++;
+        }
+      }
+
+      if (copyB) {
+        while (true) {
+          uint32_t nk = packingB_jobs++;
+          Index ni = nk / blocks_k;
+          Index ki = nk % blocks_k;
+          if (ni >= blocks_n) break;
+
+          RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
+          if (transposeB) {
+            const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
+                                          (ni*bn);
+            libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
+                           actual_bk(ki), stride_b, bk);
+          } else {
+            const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
+                                          (ki*bk);
+            internal::pack_simple<RhsScalar, Index>(blockB, current_b,
+                actual_bn(ni), actual_bk(ki), bk, stride_b);
+          }
+          packingB_done.at(ni)++;
+        }
+      }
+
+      // Compute
+
+      while (true) {
+        uint32_t mn = compute_jobs++;
+        Index mi = mn / blocks_n;
+        Index ni = mn % blocks_n;
+        if (mi >= blocks_m) break;
+
+        // Wait for mi, ni packings to be done. This is more fine-grained than
+        // waiting for all workers to finish packing.
+        while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
+               (copyB && (packingB_done.at(ni) < blocks_k)))
+        {}
+
+        for (Index ki=0; ki < blocks_k; ++ki) {
+          const LhsScalar * current_a = copyA ?
+              blocksA + (bk*bm) * (mi*blocks_k+ki) :
+              leftData + (bk*ki)*stride_a + (bm*mi);
+          const RhsScalar * current_b = copyB ?
+              blocksB + (bk*bn) * (ni*blocks_k+ki) :
+              rightData + (ni*bn)*stride_b + (bk*ki);
+
+          Index current_stride_a = copyA ? bm : stride_a;
+          Index current_stride_b = copyB ? bk : stride_b;
+
+          // Memory may not be zeroed, overwrite instead of adding in first
+          // iteration.
+          float beta = ki == 0 ? 0 : 1;
+
+          Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
+          internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
+              0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
+              current_stride_a, current_stride_b, stride_c, 1, beta, 0)
+              (current_a, current_b, current_c);
+        }
+      }
+
+      workers_done.Notify();
+    }
+
+    void run() {
+      // Parallelization strategy.
+      //
+      // First pack A into blocks (sharding by m, k) and B (sharding by n, k),
+      // then shard by m, n.
+      //
+      // Do not use advanced ThreadPool queuing, just run a single long-standing
+      // function in each thread.
+      if (copyA) {
+        blocksA = static_cast<LhsScalar*>(device.allocate(
+            (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
+      }
+      if (copyB) {
+        blocksB = static_cast<RhsScalar*>(device.allocate(
+            (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
+      }
+
+      for (Index i = 0; i < num_threads; ++i) {
+        device.enqueueNoNotification([=]() { worker(); });
+      }
+
+      workers_done.Wait();
+
+      if (copyA) {
+        device.deallocate(blocksA);
+      }
+      if (copyB) {
+        device.deallocate(blocksB);
+      }
+    }
+
+   private:
+    // Real block size for block index in [0, ..., blocks - 1].
+    Index actual_bm(Index mi) const {
+      return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
+    }
+    Index actual_bk(Index ki) const {
+      return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
+    }
+    Index actual_bn(Index ni) const {
+      return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
+    }
+
+    const Device& device;
+    Index m, k, n;
+    Index stride_a, stride_b, stride_c;
+    Index bm, bk, bn;                    // Block sizes.
+    Index blocks_m, blocks_k, blocks_n;  // Number of blocks in each dimension.
+    bool copyA, copyB, transposeA, transposeB;
+    Index num_threads;
+    Scalar *buffer;
+    const LhsScalar *leftData;
+    const RhsScalar *rightData;
+
+    LhsScalar *blocksA;
+    RhsScalar *blocksB;
+    // Barrier for joining all threads after all done.
+    Barrier workers_done;
+    // "Queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q).
+    std::atomic<uint32_t> packingA_jobs;
+    std::atomic<uint32_t> packingB_jobs;
+    std::atomic<uint32_t> compute_jobs;
+    // Already packed blocks for each mi-panel in A and ni-panel in B.
+    std::vector<std::atomic<uint8_t>> packingA_done;
+    std::vector<std::atomic<uint8_t>> packingB_done;
+  };
+#endif
+
 };
 
 } // end namespace Eigen
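The "queues" in ContextXsmm::worker are just shared atomic counters: each worker claims flattened job indices with an atomic post-increment until the range runs out, so no locks or explicit queues are needed. A self-contained sketch of that idiom, with std::thread standing in for Eigen's device thread pool:

```cpp
#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

int main() {
  const uint32_t blocks_m = 8, blocks_n = 8;
  std::atomic<uint32_t> jobs(0);                 // the shared "queue"
  std::vector<int> done(blocks_m * blocks_n, 0);

  auto worker = [&]() {
    while (true) {
      uint32_t mn = jobs++;                      // claim the next job index
      if (mn >= blocks_m * blocks_n) break;      // range exhausted
      uint32_t mi = mn / blocks_n, ni = mn % blocks_n;
      done[mi * blocks_n + ni] = 1;              // stand-in for the block kernel
    }
  };

  std::vector<std::thread> pool;
  for (int t = 0; t < 4; ++t) pool.emplace_back(worker);
  for (auto& t : pool) t.join();
}
```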
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 08eb5595a..f060191ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -253,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
         // get data into line_buf
         const Index stride = m_strides[dim];
         if (stride == 1) {
-          memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+          m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -271,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
 
         // write back
         if (FFTDir == FFT_FORWARD && stride == 1) {
-          memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+          m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 30d3ebcff..03169d591 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -200,19 +200,15 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
   return a[I];
 }
 
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N> > {
   static const size_t value = N;
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N>& > {
   static const size_t value = N;
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N> > {
   static const size_t value = N;
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N>& > {
   static const size_t value = N;
 };
@@ -251,14 +247,6 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(const std::array<T,N>& a) {
 
 #undef STD_GET_ARR_HACK
 
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
-  static const size_t value = N;
-};
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<std::array<T,N> > {
-  static const size_t value = N;
-};
 } // end namespace internal
 } // end namespace Eigen
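Context for the EmulateArray.h deletions: a class template needs only one forward declaration, after which any number of specializations may follow, so the repeated `template <typename T> struct array_size;` lines were redundant duplicates. A toy version of the same pattern, with made-up names:

```cpp
#include <cstddef>

template <typename T> struct size_of;  // one forward declaration is enough

template <typename T, std::size_t N> struct buffer { T data[N]; };

// Any number of specializations can follow the single declaration.
template <typename T, std::size_t N> struct size_of<buffer<T, N> > {
  static const std::size_t value = N;
};
template <typename T, std::size_t N> struct size_of<const buffer<T, N> > {
  static const std::size_t value = N;
};

static_assert(size_of<buffer<float, 4> >::value == 4, "deduced from the type");
```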
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 4bb1852b6..9ad2b9cc8 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -204,7 +204,8 @@ struct matrix_exp_computeUV
 template <typename MatrixType>
 struct matrix_exp_computeUV<MatrixType, float>
 {
-  static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
   {
     using std::frexp;
     using std::pow;
@@ -227,7 +228,8 @@ struct matrix_exp_computeUV<MatrixType, float>
 template <typename MatrixType>
 struct matrix_exp_computeUV<MatrixType, double>
 {
-  static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
   {
     using std::frexp;
     using std::pow;
@@ -254,7 +256,8 @@ struct matrix_exp_computeUV<MatrixType, double>
 template <typename MatrixType>
 struct matrix_exp_computeUV<MatrixType, long double>
 {
-  static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings)
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
   {
 #if LDBL_MANT_DIG == 53   // double precision
     matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
@@ -351,11 +354,11 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result)
     return;
   }
 #endif
-  MatrixType U, V;
+  typename MatrixType::PlainObject U, V;
   int squarings;
   matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
-  MatrixType numer = U + V;
-  MatrixType denom = -U + V;
+  typename MatrixType::PlainObject numer = U + V;
+  typename MatrixType::PlainObject denom = -U + V;
   result = denom.partialPivLu().solve(numer);
   for (int i=0; i<squarings; i++)
     result *= result;   // undo scaling by repeated squaring
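Context for the MatrixExponential.h hunks: once run() accepts an arbitrary ArgType, MatrixType may itself be an expression type rather than a concrete matrix, and expression types are generally not default-constructible storage. PlainObject is the concrete matrix type that evaluates an expression, which is why U, V, numer, and denom switch to it. A sketch of the distinction (our example, not the library's code):

```cpp
#include <Eigen/Dense>

template <typename Derived>
void takes_expression(const Eigen::MatrixBase<Derived>& arg) {
  // Derived might be, e.g., Transpose<Matrix3d>; you cannot declare
  // "Derived tmp;", but PlainObject is always a real, storable matrix.
  typename Derived::PlainObject tmp = arg * 2;  // materializes the expression
  (void)tmp;
}

int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Identity();
  takes_expression(A.transpose());  // passes an expression, not a matrix
}
```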