author     Eugene Zhulenev <ezhulenev@google.com>   2019-08-19 11:44:25 -0700
committer  Eugene Zhulenev <ezhulenev@google.com>   2019-08-19 11:44:25 -0700
commit     071311821e509d87bec609d6a3aeea9dc74cfd66 (patch)
tree       686922520a522c508d6074396a0303545f1b09d5
parent     d55d392e7b1136655b4223bea8e99cb2fe0a8afd (diff)
Remove XSMM support from Tensor module
-rw-r--r--  cmake/FindXsmm.cmake                                                25
-rw-r--r--  unsupported/Eigen/CXX11/Tensor                                       4
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h            281
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h    135
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h  201
-rw-r--r--  unsupported/test/CMakeLists.txt                                     11
-rw-r--r--  unsupported/test/cxx11_tensor_contraction.cpp                        7
7 files changed, 0 insertions, 664 deletions
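The removed backend was only reachable when Eigen was built with EIGEN_USE_LIBXSMM (plus AVX vectorization for the kernel path), and only for contractions that are exactly a single, possibly transposed, matrix product; everything else already fell through to Eigen's own GEMM code. A minimal standalone sketch of that eligibility rule follows; the helper name and signature are illustrative, not Eigen API, and mirror the internal EnableXSMMIfPossible check shown in the diff below: the contracting dimensions of both operands must form a contiguous, identically ordered run that is a prefix or a suffix of each operand's dimension list.

    #include <utility>
    #include <vector>

    // Hypothetical standalone helper (not part of Eigen). idx holds
    // (lhs_dim, rhs_dim) contraction pairs with the lhs side already sorted,
    // mirroring eval_op_indices in the removed code.
    bool contraction_is_plain_matmul(const std::vector<std::pair<int, int> >& idx,
                                     int lhs_rank, int rhs_rank) {
      const int n = static_cast<int>(idx.size());
      if (n == 0) return false;  // nothing to contract; not a matmul
      for (int i = 1; i < n; ++i) {
        // Both sequences must be increasing, with no holes, in the same order.
        if (idx[i].first - idx[i - 1].first != 1) return false;
        if (idx[i].second - idx[i - 1].second != 1) return false;
      }
      // Each contiguous run must be a prefix or a suffix of its operand's dims.
      if (idx[0].first != 0 && idx[n - 1].first != lhs_rank - 1) return false;
      if (idx[0].second != 0 && idx[n - 1].second != rhs_rank - 1) return false;
      return true;
    }

Using the examples from the removed comment: 4D lhs and rhs contracted over [(0, 2), (1, 3)] passes, while [(0, 0), (2, 2)] fails the no-holes test and [(0, 2), (1, 1)] fails because the rhs sequence is not increasing.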
diff --git a/cmake/FindXsmm.cmake b/cmake/FindXsmm.cmake
deleted file mode 100644
index 809d6f414..000000000
--- a/cmake/FindXsmm.cmake
+++ /dev/null
@@ -1,25 +0,0 @@
-# libxsmm support.
-# libxsmm provides matrix multiplication kernels optimized for
-# the latest Intel architectures.
-# Download the library from https://github.com/hfp/libxsmm
-# Compile with make BLAS=0
-
-if (LIBXSMM)
-  set(XSMM_FIND_QUIETLY TRUE)
-  set(XSMM_INCLUDES ${LIBXSMM}/include)
-  set(XSMM_LIBRARIES ${LIBXSMM}/lib)
-endif (LIBXSMM)
-
-find_path(LIBXSMM
-  NAMES
-  libxsmm.h
-  PATHS
-  $ENV{XSMMDIR}/include
-  ${INCLUDE_INSTALL_DIR}
-)
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(XSMM DEFAULT_MSG
-                                  LIBXSMM)
-
-mark_as_advanced(LIBXSMM)
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 25b663046..5d18aeb3f 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -73,10 +73,6 @@ typedef unsigned __int64 uint64_t;
 #include <time.h>
 #endif
 
-#if defined(EIGEN_USE_LIBXSMM)
-#include "libxsmm.h"
-#endif
-
 #ifdef EIGEN_USE_THREADS
 #include "ThreadPool"
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index de7c2248a..a398b2b3f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,70 +20,6 @@ namespace Eigen {
   *
   */
 namespace internal {
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-template<typename Scalar, typename Index>
-void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
-  size_t psize = packet_traits<Scalar>::size;            // Packet size
-  typedef typename packet_traits<Scalar>::type Packet;   // Packet type
-  size_t alignment = psize*sizeof(Scalar);               // Needed alignment
-  if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
-      (ldsrc*sizeof(Scalar)) % alignment == 0 &&
-      reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
-      reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
-    // Optimized version using packets
-    size_t num_packets = rows / psize;
-    for (Index col = 0; col < cols; ++col) {
-      EIGEN_ASM_COMMENT("begin pack_simple inner copy");
-      // Unrolled manually 4 times.
-      for (size_t i=0; i < num_packets/4; ++i) {
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-      }
-      for (size_t i=0; i < num_packets%4; ++i) {
-        internal::pstore(dst, internal::pload<Packet>(src));
-        dst += psize; src += psize;
-      }
-      dst += lddst - num_packets*psize;
-      src += ldsrc - num_packets*psize;
-      EIGEN_ASM_COMMENT("end pack_simple inner copy");
-    }
-  } else {
-    // Naive memcpy calls
-    for (Index col = 0; col < cols; ++col) {
-      memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
-    }
-  }
-}
-
-template<typename LhsScalar, typename RhsScalar, typename Scalar>
-  struct libxsmm_wrapper {
-    libxsmm_wrapper() {}
-    libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {}
-    void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {}
-    void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {}
-  };
-
-  template<>
-  struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
-    libxsmm_wrapper(): libxsmm_mmfunction() {}
-    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
-        libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
-  };
-
-  template<>
-  struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
-    libxsmm_wrapper(): libxsmm_mmfunction() {}
-    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
-        libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
-  };
-#endif
-
 template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
@@ -640,8 +576,6 @@ struct TensorContractionEvaluatorBase
       }
     }
 
-    EnableXSMMIfPossible(eval_op_indices);
-
     // If the layout is RowMajor, we need to reverse the m_dimensions
     if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
      for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -780,13 +714,6 @@ struct TensorContractionEvaluatorBase
   EIGEN_DEVICE_FUNC
   #endif
   void evalGemm(Scalar* buffer) const {
-    #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-    if (m_can_use_xsmm) {
-      evalGemmXSMM(buffer);
-      return;
-    }
-    #endif
-
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
@@ -942,213 +869,6 @@ struct TensorContractionEvaluatorBase
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
 
 protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
-    m_can_use_xsmm = false;
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    if (!std::is_same<Scalar, LhsScalar>::value ||
-        !std::is_same<Scalar, RhsScalar>::value ||
-        !(std::is_same<Scalar, float>::value ||
-          std::is_same<Scalar, double>::value) ||
-        m_leftImpl.data() == NULL ||
-        m_rightImpl.data() == NULL) {
-      return;
-    }
-
-    // Check if we can use faster matmul algorithms. For contraction to be
-    // equivalent to matmul, we need both lhs and rhs contracting dims sequences
-    // to be either a prefix or suffix of all dims. Also, the order of both
-    // must be the same, so we don't have to do reordering.
-    // For example:
-    // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
-    // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
-    // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
-    // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
-    // Depending if contraction dims are prefix or suffix of all dims we need to
-    // pre-transpose matrices in matmul algorithm:
-    // lhs: prefix -> transpose, suffix -> no transpose
-    // rhs: prefix -> no transpose, suffix -> transpose
-    // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
-    // non-transposed matmul.
-    if (ContractDims == 0) {
-      // This case is totally uninteresting, filter it out to avoid problems
-      // with iterations in further tests.
-      return;
-    }
-
-    // Check if RHS dims list is increasing. LHS already is, so if not, the
-    // order is different and we cannot do matmul.
-    for (int i = 1; i < ContractDims; i++) {
-      if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
-        return;
-      }
-    }
-
-    // Check if no holes.
-    int diff;
-    for (int i = 1; i < ContractDims; i++) {
-      // LHS contract dims are sorted to form an increasing seq.
-      diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
-      if (diff != 1) {
-        return;
-      }
-      // Now we may already assume RHS contract dims seq is increasing too.
-      diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
-      if (diff != 1) {
-        return;
-      }
-    }
-
-    // Check if suffix or prefix.
-    if (eval_op_indices[0].first != 0 &&
-        eval_op_indices[ContractDims-1].first != LDims-1) {
-      return;
-    }
-    if (eval_op_indices[0].second != 0 &&
-        eval_op_indices[ContractDims-1].second != RDims-1) {
-      return;
-    }
-
-    m_can_use_xsmm = true;
-#else
-    EIGEN_UNUSED_VARIABLE(eval_op_indices);
-#endif
-  }
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-  EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    const bool transposeA = !m_lhs_inner_dim_contiguous;
-    const bool transposeB = !m_rhs_inner_dim_contiguous;
-
-    typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-    typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
-    internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
-        k, m, n, 1, transposeA, transposeB);
-
-    // Outer blocks sizes
-    const Index mc_outer = blocking.outer_m();
-    const Index nc_outer = blocking.outer_n();
-    const Index kc_outer = blocking.outer_k();
-    // Inner blocks sizes
-    const Index mc = blocking.mc();
-    const Index nc = blocking.nc();
-    const Index kc = blocking.kc();
-    // Decisions whether we should copy parts of matrices
-    const bool copyA = blocking.copyA();
-    const bool copyB = blocking.copyB();
-
-    const LhsScalar* leftData = m_leftImpl.data();
-    const RhsScalar* rightData = m_rightImpl.data();
-
-    const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
-    const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
-    const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
-
-    const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
-    // Use bigger stride to avoid hitting same cache line too often.
-    // This consistently gives +~0.5 Gflops.
-    const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
-        kc % 32 == 0 ? kc + 16 : kc
-    );
-
-    // Kernel for the general case (not edges)
-    internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
-
-    LhsScalar* blockA = NULL;
-    RhsScalar* panelB = NULL;
-
-    if (copyA) {
-      blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
-    }
-    if (copyB) {
-      panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
-    }
-
-    const Index kernel_stride_A = copyA ? stride_blockA : stride_A;
-    const Index kernel_stride_B = copyB ? stride_panelB : stride_B;
-    kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
-
-    // Outer blocking
-    for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) {
-      for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
-        for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
-          using numext::mini;
-
-          Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
-
-          // Inner blocking
-          for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) {
-            const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki;
-            const float beta = ki == 0 ? 0 : 1;
-
-            if (copyB) {
-              if (transposeB) {
-                libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
-              } else {
-                internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
-              }
-            }
-
-            for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
-              const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
-
-              const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki :
-                  leftData + ki*stride_A + mi;
-
-              if (copyA) {
-                if (transposeA) {
-                  libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
-                } else {
-                  internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
-                }
-              }
-              const LhsScalar* actual_a = copyA ? blockA : a;
-
-              for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
-                const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
-
-                const RhsScalar* b = rightData + ni*stride_B + ki;
-                Scalar* c = buffer + ni*stride_C + mi;
-                const Scalar* cp = c + nc*stride_C;
-
-                const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
-                const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
-
-                if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
-                  // Most used, cached kernel.
-                  kernel(actual_a, actual_b, c, actual_a, bp, cp);
-                } else {
-                  // Edges - use libxsmm kernel cache.
-                  internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-
-    if (copyA) {
-      this->m_device.deallocate(blockA);
-    }
-    if (copyB) {
-      this->m_device.deallocate(panelB);
-    }
-  }
-#endif
-
   // Prevent assignment
   TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
 
   Dimensions m_dimensions;
@@ -1177,7 +897,6 @@ protected:
   const Device EIGEN_DEVICE_REF m_device;
   OutputKernelType m_output_kernel;
   EvaluatorPointerType m_result;
-  bool m_can_use_xsmm;
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index c51f3f8dd..974feb0ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -67,141 +67,6 @@ class TensorContractionBlocking {
   StorageIndex nc_;
 };
 
-
-
-#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
-class TensorXsmmContractionBlocking {
- public:
-  TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
-      size_t max_num_threads = 1, bool transposeA = false,
-      bool transposeB = false):
-    k_(k), m_(m), n_(n), transposeA_(transposeA),
-    transposeB_(transposeB), num_threads_(max_num_threads) {
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
-    if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
-      mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
-      kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
-      nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
-      outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
-      outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
-      outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
-      copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
-      copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
-      outer_m_ = outer_m_ != 0 ? outer_m_ : m;
-      outer_k_ = outer_k_ != 0 ? outer_k_ : k;
-      outer_n_ = outer_n_ != 0 ? outer_n_ : n;
-    }
-#else
-    // Defaults, possibly overridden per-platform.
-    copyA_ = true;
-    copyB_ = false;
-
-    // If the matrix is small enough, don't do blocking, just call single xsmm
-    // kernel.
-    if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
-      mc_ = m; kc_ = k; nc_ = n;
-      outer_m_ = m; outer_k_ = k; outer_n_ = n;
-      copyA_ = false; copyB_ = false;
-    } else {
-      int arch = libxsmm_cpuid_x86();
-
-      if (arch == LIBXSMM_X86_AVX512_CORE) {
-        // skylake
-        mc_ = 64; kc_ = 64; nc_ = 24;
-        outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
-        // Hack to use this kernel architecture as the other one has performance
-        // issues (no hardware prefetching).
-        // TODO(nishantpatil): This should be removed if the issues are fixed,
-        // or this one becomes the default.
-        setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
-      } else if (arch == LIBXSMM_X86_AVX2) {
-        // haswell
-        mc_ = 32; kc_ = 192; nc_ = 33;
-        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
-      } else if (arch == LIBXSMM_X86_AVX) {
-        // ivybridge
-        mc_ = 32; kc_ = 192; nc_ = 48;
-        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
-      } else {
-        // generic kernel size, usually performing well
-        mc_ = 32; kc_ = 128; nc_ = 32;
-        outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
-      }
-
-      // Only copy if it makes the stride smaller.
-      copyA_ = copyA_ && (m > mc_);
-      copyB_ = copyB_ && (k > kc_);
-    }
-
-    // We need to copy anyway if transposing
-    copyA_ = copyA_ || transposeA;
-    copyB_ = copyB_ || transposeB;
-
-    // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
-    prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
-
-#endif
-
-    mc_ = mc_ > m ? m : mc_;
-    nc_ = nc_ > n ? n : nc_;
-    kc_ = kc_ > k ? k : kc_;
-
-    size_t compute_parallelism = (m / mc_) * (n / nc_);
-    size_t pack_parallelism = 0;
-    if (copyA_) {
-      pack_parallelism += (m / mc_) * (k / kc_);
-    }
-    if (copyB_) {
-      pack_parallelism += (n / nc_) * (k / kc_);
-    }
-    size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
-
-    num_threads_ = numext::mini<size_t>(num_threads_,
-                                        parallelism / MIN_JOBS_PER_THREAD);
-    num_threads_ = numext::maxi<size_t>(num_threads_, 1);
-
-    // For optimal performance outer block sizes should be multiplies of kernel
-    // sizes, or bigger than matrix size (=no outer blocking).
-    eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
-    eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
-    eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
-  }
-
-  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
-  EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
-  EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
-  EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
-  EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
-  EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
-  EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
-  EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
-  EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
-  EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
-  EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
-  EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
-  EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
-    return prefetch_;
-  }
-
- private:
-  StorageIndex k_, m_, n_;
-  StorageIndex kc_, mc_, nc_;
-  StorageIndex outer_k_, outer_m_, outer_n_;
-  bool copyA_, copyB_, transposeA_, transposeB_;
-  size_t num_threads_;
-
-  // Threshold for m*k*n to skip blocking and just call libxsmm
-  const double LIBXSMM_THRESHOLD = 80*80*80;
-  // For computing optimal number of threads - so that each thread gets at least
-  // that many jobs.
-  const double MIN_JOBS_PER_THREAD = 3;
-  libxsmm_gemm_prefetch_type prefetch_;
-};
-#endif // EIGEN_USE_LIBXSMM
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 22db6f01b..ca20038a4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -78,23 +78,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     const Index k = this->m_k_size;
     if (m == 0 || n == 0 || k == 0) return;
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-    if (this->m_can_use_xsmm) {
-      bool transposeA = !this->m_lhs_inner_dim_contiguous;
-      bool transposeB = !this->m_rhs_inner_dim_contiguous;
-      internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
-          blocking(k, m, n, this->m_device.numThreads(), transposeA,
-                   transposeB);
-
-      if (blocking.num_threads() == 1) {
-        this->evalGemmXSMM(buffer);
-      } else {
-        ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
-      }
-      return;
-    }
-#endif
-
     // Compute a set of algorithm parameters:
     // - kernel block sizes (bm, bn, bk)
     // - task grain sizes (number of kernels executed per task: gm, gn)
@@ -1227,190 +1210,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     return computeBandwidth;
   }
 
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-  // TODO(ezhulenev): Add support for output kernels and LIBXSMM.
-  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
-                "XSMM does not support contraction output kernels.");
-
-  template<int Alignment>
-  class ContextXsmm {
-   public:
-    ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
-                const internal::TensorXsmmContractionBlocking<LhsScalar,
-                    RhsScalar, Index>& blocking):
-        device(self->m_device),
-        m(m), k(k), n(n),
-        stride_a(blocking.transposeA() ? k : m),
-        stride_b(blocking.transposeB() ? n : k),
-        stride_c(m),
-        bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
-        blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
-        blocks_n(blocking.blocks_n()),
-        copyA(blocking.copyA()), copyB(blocking.copyB()),
-        transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
-        num_threads(blocking.num_threads()),
-        buffer(buffer),
-        leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
-        workers_done(blocking.num_threads()),
-
-        packingA_jobs(0), packingB_jobs(0), compute_jobs(0),
-        packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {}
-
-    void worker() {
-      // Pack
-
-      if (copyA) {
-        while (true) {
-          uint32_t mk = packingA_jobs++;
-          Index mi = mk / blocks_k;
-          Index ki = mk % blocks_k;
-          if (mi >= blocks_m) break;
-
-          LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
-          if (transposeA) {
-            const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
-            libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
-                           actual_bm(mi), stride_a, bm);
-          } else {
-            const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
-            internal::pack_simple<LhsScalar, Index>(blockA, current_a,
-                                                    actual_bk(ki), actual_bm(mi), bm, stride_a);
-          }
-          packingA_done.at(mi)++;
-        }
-      }
-
-      if (copyB) {
-        while (true) {
-          uint32_t nk = packingB_jobs++;
-          Index ni = nk / blocks_k;
-          Index ki = nk % blocks_k;
-          if (ni >= blocks_n) break;
-
-          RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
-          if (transposeB) {
-            const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
-                                          (ni*bn);
-            libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
-                           actual_bk(ki), stride_b, bk);
-          } else {
-            const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
-                                          (ki*bk);
-            internal::pack_simple<RhsScalar, Index>(blockB, current_b,
-                                                    actual_bn(ni), actual_bk(ki), bk, stride_b);
-          }
-          packingB_done.at(ni)++;
-        }
-      }
-
-      // Compute
-
-      while (true) {
-        uint32_t mn = compute_jobs++;
-        Index mi = mn / blocks_n;
-        Index ni = mn % blocks_n;
-        if (mi >= blocks_m) break;
-
-        // Wait for mi, ni packings to be done. This is more fine-grained than
-        // waiting for all workers to finish packing.
-        while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
-               (copyB && (packingB_done.at(ni) < blocks_k)))
-        {}
-
-        for (Index ki=0; ki < blocks_k; ++ki) {
-          const LhsScalar * current_a = copyA ?
-              blocksA + (bk*bm) * (mi*blocks_k+ki) :
-              leftData + (bk*ki)*stride_a + (bm*mi);
-          const RhsScalar * current_b = copyB ?
-              blocksB + (bk*bn) * (ni*blocks_k+ki) :
-              rightData + (ni*bn)*stride_b + (bk*ki);
-
-          Index current_stride_a = copyA ? bm : stride_a;
-          Index current_stride_b = copyB ? bk : stride_b;
-
-          // Memory may not be zeroed, overwrite instead of adding in first
-          // iteration.
-          float beta = ki == 0 ? 0 : 1;
-
-          Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
-          internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
-              0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
-              current_stride_a, current_stride_b, stride_c, 1, beta, 0)
-          (current_a, current_b, current_c);
-        }
-      }
-
-      workers_done.Notify();
-    }
-
-    void run() {
-      // Parallelization strategy.
-      //
-      // First pack A into blocks (sharding by m, k) and B (sharding by n,k),
-      // then shard by m, n.
-      //
-      // Do not use advanced ThreadPool queuing, just run a single long-standing
-      // function in each thread.
-      if (copyA) {
-        blocksA = static_cast<LhsScalar*>(device.allocate(
-            (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
-      }
-      if (copyB) {
-        blocksB = static_cast<RhsScalar*>(device.allocate(
-            (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
-      }
-
-      for (Index i = 0; i < num_threads; ++i) {
-        device.enqueueNoNotification([=]() { worker(); });
-      }
-
-      workers_done.Wait();
-
-      if (copyA) {
-        device.deallocate(blocksA);
-      }
-      if (copyB) {
-        device.deallocate(blocksB);
-      }
-    }
-
-   private:
-    // real block size for block index in [0, ..., blocks - 1].
-    Index actual_bm(Index mi) const {
-      return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
-    }
-    Index actual_bk(Index ki) const {
-      return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
-    }
-    Index actual_bn(Index ni) const {
-      return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
-    }
-
-    const Device& device;
-    Index m, k, n;
-    Index stride_a, stride_b, stride_c;
-    Index bm, bk, bn;                    // Block sizes.
-    Index blocks_m, blocks_k, blocks_n;  // Number of blocks in each dimension.
-    bool copyA, copyB, transposeA, transposeB;
-    Index num_threads;
-    Scalar *buffer;
-    const LhsScalar *leftData;
-    const RhsScalar *rightData;
-
-    LhsScalar *blocksA;
-    RhsScalar *blocksB;
-    // barrier for joining all threads after all done.
-    Barrier workers_done;
-    // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q)
-    std::atomic<uint32_t> packingA_jobs;
-    std::atomic<uint32_t> packingB_jobs;
-    std::atomic<uint32_t> compute_jobs;
-    // already packed blocks for each mi-panel in A and ni-panel in B.
-    std::vector<std::atomic<uint8_t>> packingA_done;
-    std::vector<std::atomic<uint8_t>> packingB_done;
-  };
-#endif
-
 };
 
 } // end namespace Eigen
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index e6c757275..42a450a85 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -12,17 +12,6 @@ include_directories(../../test ../../unsupported ../../Eigen
 
 find_package (Threads)
 
-find_package(Xsmm)
-if(XSMM_FOUND)
-  add_definitions("-DEIGEN_USE_LIBXSMM")
-  include_directories(${XSMM_INCLUDES})
-  link_directories(${XSMM_LIBRARIES})
-  set(EXTERNAL_LIBS ${EXTERNAL_LIBS} xsmm)
-  ei_add_property(EIGEN_TESTED_BACKENDS "Xsmm, ")
-else(XSMM_FOUND)
-  ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
-endif(XSMM_FOUND)
-
 find_package(GoogleHash)
 if(GOOGLEHASH_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 75f2e1edf..2fd128121 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -511,8 +511,6 @@ static void test_const_inputs()
   VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
 }
 
-#if !defined(EIGEN_USE_LIBXSMM)
-
 // Apply Sqrt to all output elements.
 struct SqrtOutputKernel {
   template <typename Index, typename Scalar>
@@ -562,9 +560,6 @@ static void test_large_contraction_with_output_kernel() {
   }
 }
 
-#endif // !defined(EIGEN_USE_LIBXSMM)
-
-
 EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
 {
   CALL_SUBTEST(test_evals<ColMajor>());
@@ -597,8 +592,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
   CALL_SUBTEST(test_tensor_product<RowMajor>());
   CALL_SUBTEST(test_const_inputs<ColMajor>());
   CALL_SUBTEST(test_const_inputs<RowMajor>());
-#if !defined(EIGEN_USE_LIBXSMM)
   CALL_SUBTEST(test_large_contraction_with_output_kernel<ColMajor>());
   CALL_SUBTEST(test_large_contraction_with_output_kernel<RowMajor>());
-#endif
 }
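The deleted evalGemmXSMM and ContextXsmm code drove libxsmm's JIT-compiled micro-kernels from a two-level blocking scheme: outer cache-sized blocks over m, n and k, inner kernel-sized tiles chosen per CPU architecture (Skylake, Haswell, Ivy Bridge, or a generic fallback), optional packing of A blocks and B panels, and beta = 0 on the first k-block so the output buffer never has to be pre-zeroed. A minimal single-level sketch of that loop structure on plain column-major arrays, with no packing, no libxsmm dependency, and purely illustrative block sizes:

    #include <algorithm>

    // Sketch of a blocked column-major GEMM, C (m x n) += A (m x k) * B (k x n).
    // The caller is assumed to have zero-initialized C; the block sizes are
    // illustrative, not the tuned per-architecture values from the removed code.
    void blocked_gemm(const float* A, const float* B, float* C,
                      int m, int n, int k,
                      int bm = 64, int bn = 24, int bk = 64) {
      for (int j0 = 0; j0 < n; j0 += bn)
        for (int p0 = 0; p0 < k; p0 += bk)
          for (int i0 = 0; i0 < m; i0 += bm)
            // One tile per (i0, j0, p0); the removed code handed exactly this
            // inner triple loop to a cached libxsmm kernel.
            for (int j = j0; j < std::min(j0 + bn, n); ++j)
              for (int p = p0; p < std::min(p0 + bk, k); ++p)
                for (int i = i0; i < std::min(i0 + bm, m); ++i)
                  C[i + j * m] += A[i + p * m] * B[p + j * k];
    }

In the threaded ContextXsmm variant, the packing jobs and the (mi, ni) tile jobs were distributed over the device's thread pool behind atomic job counters, with per-panel completion counters so compute workers only waited on the panels they actually needed.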
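With the backend gone, tensor contraction always runs through Eigen's own kernels, and the output-kernel tests in cxx11_tensor_contraction.cpp no longer need the !defined(EIGEN_USE_LIBXSMM) guard (the removed static_assert documents that the XSMM path never supported contraction output kernels). A minimal sketch of the public contraction API that remains, with illustrative shapes and values:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      // Contracting dimension 1 of lhs with dimension 0 of rhs is the
      // "regular, non-transposed matmul" case described in the removed
      // EnableXSMMIfPossible comment.
      Eigen::Tensor<float, 2> lhs(30, 40);
      Eigen::Tensor<float, 2> rhs(40, 20);
      lhs.setRandom();
      rhs.setRandom();

      Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
      Eigen::Tensor<float, 2> result = lhs.contract(rhs, dims);

      std::cout << "result is " << result.dimension(0) << " x "
                << result.dimension(1) << std::endl;  // 30 x 20
      return 0;
    }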