From f9eff17e915e270e654287723cea67be495f5c5f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 21 Dec 2016 12:32:06 -0800
Subject: Leverage libxsmm kernels within single threaded contractions

---
 .../CXX11/src/Tensor/TensorContractionBlocking.h | 134 +++++++++++++++++++++
 1 file changed, 134 insertions(+)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 5cf7b4f71..d34f9caee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -50,6 +50,140 @@ class TensorContractionBlocking {
 };
 
+
+#if defined(EIGEN_USE_LIBXSMM)
+template <typename LhsScalar, typename RhsScalar, typename Index>
+class TensorXsmmContractionBlocking {
+ public:
+  TensorXsmmContractionBlocking(Index k, Index m, Index n,
+      size_t max_num_threads = 1, bool transposeA = false,
+      bool transposeB = false):
+    k_(k), m_(m), n_(n), transposeA_(transposeA),
+    transposeB_(transposeB), num_threads_(max_num_threads) {
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+    if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+      mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
+      kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
+      nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
+      outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
+      outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
+      outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
+      copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
+      copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
+      outer_m_ = outer_m_ != 0 ? outer_m_ : m;
+      outer_k_ = outer_k_ != 0 ? outer_k_ : k;
+      outer_n_ = outer_n_ != 0 ? outer_n_ : n;
+    }
+#else
+    // Defaults, possibly overridden per-platform.
+    copyA_ = true;
+    copyB_ = false;
+
+    // If the matrix is small enough, don't do blocking, just call single xsmm
+    // kernel.
+    if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
+      mc_ = m; kc_ = k; nc_ = n;
+      outer_m_ = m; outer_k_ = k; outer_n_ = n;
+      copyA_ = false; copyB_ = false;
+    } else {
+      int arch = libxsmm_cpuid_x86();
+
+      if (arch == LIBXSMM_X86_AVX512_CORE) {
+        // skylake
+        mc_ = 64; kc_ = 64; nc_ = 24;
+        outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
+        // Hack to use this kernel architecture as the other one has performance
+        // issues (no hardware prefetching).
+        // TODO(nishantpatil): This should be removed if the issues are fixed,
+        // or this one becomes the default.
+        setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
+      } else if (arch == LIBXSMM_X86_AVX2) {
+        // haswell
+        mc_ = 32; kc_ = 192; nc_ = 33;
+        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
+      } else if (arch == LIBXSMM_X86_AVX) {
+        // ivybridge
+        mc_ = 32; kc_ = 192; nc_ = 48;
+        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
+      } else {
+        // generic kernel size, usually performing well
+        mc_ = 32; kc_ = 128; nc_ = 32;
+        outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
+      }
+
+      // Only copy if it makes the stride smaller.
+      copyA_ = copyA_ && (m > mc_);
+      copyB_ = copyB_ && (k > kc_);
+    }
+
+    // We need to copy anyway if transposing
+    copyA_ = copyA_ || transposeA;
+    copyB_ = copyB_ || transposeB;
+
+    // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
+    prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
+
+#endif
+
+    mc_ = mc_ > m ? m : mc_;
+    nc_ = nc_ > n ? n : nc_;
+    kc_ = kc_ > k ? k : kc_;
+
+    size_t compute_parallelism = (m / mc_) * (n / nc_);
+    size_t pack_parallelism = 0;
+    if (copyA_) {
+      pack_parallelism += (m / mc_) * (k / kc_);
+    }
+    if (copyB_) {
+      pack_parallelism += (n / nc_) * (k / kc_);
+    }
+    size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
+
+    num_threads_ = numext::mini<size_t>(num_threads_,
+                                        parallelism / MIN_JOBS_PER_THREAD);
+    num_threads_ = numext::maxi<size_t>(num_threads_, 1);
+
+    // For optimal performance outer block sizes should be multiples of kernel
+    // sizes, or bigger than matrix size (=no outer blocking).
+    eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
+    eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
+    eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
+  }
+
+  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
+  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
+  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+  EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
+  EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
+  EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
+  EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
+  EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
+  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
+  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
+  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+  EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
+    return prefetch_;
+  }
+
+ private:
+  Index k_, m_, n_;
+  Index kc_, mc_, nc_;
+  Index outer_k_, outer_m_, outer_n_;
+  bool copyA_, copyB_, transposeA_, transposeB_;
+  size_t num_threads_;
+
+  // Threshold for m*k*n to skip blocking and just call libxsmm
+  const double LIBXSMM_THRESHOLD = 80*80*80;
+  // For computing optimal number of threads - so that each thread gets at least
+  // that many jobs.
+  const double MIN_JOBS_PER_THREAD = 3;
+  libxsmm_gemm_prefetch_type prefetch_;
+};
+#endif // EIGEN_USE_LIBXSMM
+
 
 } // end namespace internal
 } // end namespace Eigen
--
cgit v1.2.3
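
Editor's note: to make the blocking heuristic in the constructor above concrete, here is a minimal standalone C++ sketch, not part of the patch and independent of Eigen and libxsmm, that replays the parallelism estimate and thread-count clamping for a hypothetical 1024x1024x1056 contraction using the Haswell/AVX2 kernel sizes (mc=32, kc=192, nc=33). The local divup helper, the problem size, and the max_num_threads cap of 8 are illustrative assumptions rather than values taken from the patch.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // Ceiling division, analogous to the divup used by blocks_m/k/n above.
    static std::size_t divup(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

    int main() {
      // Hypothetical problem size and the AVX2 (haswell) kernel sizes from the patch.
      const std::size_t m = 1024, k = 1024, n = 1056;
      const std::size_t mc = 32, kc = 192, nc = 33;
      const bool copyA = true, copyB = false;  // default copy flags before transpose checks
      const double kMinJobsPerThread = 3;      // MIN_JOBS_PER_THREAD in the patch
      const std::size_t max_num_threads = 8;   // hypothetical caller-supplied cap

      // Same parallelism estimate as the constructor: compute jobs vs. packing jobs.
      std::size_t compute_parallelism = (m / mc) * (n / nc);
      std::size_t pack_parallelism = 0;
      if (copyA) pack_parallelism += (m / mc) * (k / kc);
      if (copyB) pack_parallelism += (n / nc) * (k / kc);
      const std::size_t parallelism = std::max(compute_parallelism, pack_parallelism);

      // Clamp the thread count so each thread gets at least kMinJobsPerThread jobs.
      std::size_t num_threads = std::min<std::size_t>(
          max_num_threads,
          static_cast<std::size_t>(parallelism / kMinJobsPerThread));
      num_threads = std::max<std::size_t>(num_threads, 1);

      std::printf("block grid: %zu x %zu x %zu (m x k x n), threads: %zu\n",
                  divup(m, mc), divup(k, kc), divup(n, nc), num_threads);
      return 0;
    }

With these numbers the 32x32 grid of compute blocks dominates the packing work, and parallelism / MIN_JOBS_PER_THREAD is far above the cap, so all 8 requested threads are kept. A problem with m*k*n at or below LIBXSMM_THRESHOLD (80*80*80) would instead skip blocking entirely and issue a single xsmm kernel call.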