// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H

namespace Eigen {
namespace internal {

enum {
  ShardByRow = 0,
  ShardByCol = 1
};

// Default Blocking Strategy.
// (An illustrative usage sketch appears at the end of this file.)
template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType = ShardByCol>
class TensorContractionBlocking {
 public:
  typedef typename LhsMapper::Scalar LhsScalar;
  typedef typename RhsMapper::Scalar RhsScalar;

  EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
      : kc_(k), mc_(m), nc_(n) {
    if (ShardingType == ShardByCol) {
      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
    } else {
      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }

 private:
  Index kc_;
  Index mc_;
  Index nc_;
};

#if defined(EIGEN_USE_LIBXSMM)
template <typename LhsScalar, typename RhsScalar, typename Index>
class TensorXsmmContractionBlocking {
 public:
  TensorXsmmContractionBlocking(Index k, Index m, Index n,
                                size_t max_num_threads = 1,
                                bool transposeA = false,
                                bool transposeB = false)
      : k_(k), m_(m), n_(n), transposeA_(transposeA),
        transposeB_(transposeB), num_threads_(max_num_threads) {
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
    if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
      mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
      kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
      nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
      outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
      outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
      outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
      copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
      copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
      outer_m_ = outer_m_ != 0 ? outer_m_ : m;
      outer_k_ = outer_k_ != 0 ? outer_k_ : k;
      outer_n_ = outer_n_ != 0 ? outer_n_ : n;
    }
#else
    // Defaults, possibly overridden per-platform.
    copyA_ = true;
    copyB_ = false;

    // If the matrix is small enough, don't do blocking: just call a single
    // xsmm kernel.
    if (static_cast<double>(m) * k * n <= LIBXSMM_THRESHOLD) {
      mc_ = m; kc_ = k; nc_ = n;
      outer_m_ = m; outer_k_ = k; outer_n_ = n;
      copyA_ = false; copyB_ = false;
    } else {
      int arch = libxsmm_cpuid_x86();

      if (arch == LIBXSMM_X86_AVX512_CORE) {
        // skylake
        mc_ = 64; kc_ = 64; nc_ = 24;
        outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
        // Hack to use this kernel architecture as the other one has
        // performance issues (no hardware prefetching).
        // TODO(nishantpatil): This should be removed if the issues are fixed,
        // or this one becomes the default.
        setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
      } else if (arch == LIBXSMM_X86_AVX2) {
        // haswell
        mc_ = 32; kc_ = 192; nc_ = 33;
        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
      } else if (arch == LIBXSMM_X86_AVX) {
        // ivybridge
        mc_ = 32; kc_ = 192; nc_ = 48;
        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
      } else {
        // Generic kernel sizes, usually performing well.
        mc_ = 32; kc_ = 128; nc_ = 32;
        outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
      }

      // Only copy if it makes the stride smaller.
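      // (Packing pays for one extra pass over each panel with contiguous,
      // cache-friendly accesses inside the kernel; when a dimension already
      // fits in a single block the stride cannot shrink, so the copy below
      // is skipped as pure overhead.)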
      copyA_ = copyA_ && (m > mc_);
      copyB_ = copyB_ && (k > kc_);
    }

    // We need to copy anyway if transposing.
    copyA_ = copyA_ || transposeA;
    copyB_ = copyB_ || transposeB;

    // See the libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h.
    prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
#endif

    // Clamp block sizes to the matrix dimensions.
    mc_ = mc_ > m ? m : mc_;
    nc_ = nc_ > n ? n : nc_;
    kc_ = kc_ > k ? k : kc_;

    size_t compute_parallelism = (m / mc_) * (n / nc_);
    size_t pack_parallelism = 0;
    if (copyA_) {
      pack_parallelism += (m / mc_) * (k / kc_);
    }
    if (copyB_) {
      pack_parallelism += (n / nc_) * (k / kc_);
    }
    size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);

    num_threads_ = numext::mini<size_t>(num_threads_,
                                        parallelism / MIN_JOBS_PER_THREAD);
    num_threads_ = numext::maxi<size_t>(num_threads_, 1);

    // For optimal performance the outer block sizes should be multiples of
    // the kernel sizes, or bigger than the matrix size (= no outer blocking).
    eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
    eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
    eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
  }

  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
  EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
  EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
  EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
  EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
  EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
  EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
    return prefetch_;
  }

 private:
  Index k_, m_, n_;
  Index kc_, mc_, nc_;
  Index outer_k_, outer_m_, outer_n_;
  bool copyA_, copyB_, transposeA_, transposeB_;
  size_t num_threads_;

  // Threshold on m*k*n below which blocking is skipped and a single libxsmm
  // kernel call is made instead.
  const double LIBXSMM_THRESHOLD = 80*80*80;
  // For computing the optimal number of threads: each thread should get at
  // least this many jobs.
  const double MIN_JOBS_PER_THREAD = 3;

  libxsmm_gemm_prefetch_type prefetch_;
};
#endif  // EIGEN_USE_LIBXSMM

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
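
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the library): how a caller
// might derive panel loops from TensorContractionBlocking. `MyLhsMapper` and
// `MyRhsMapper` are hypothetical mapper types; any types exposing a `Scalar`
// typedef would satisfy the template.
//
//   typedef Eigen::internal::TensorContractionBlocking<
//       MyLhsMapper, MyRhsMapper, Eigen::Index, Eigen::internal::ShardByCol>
//       Blocking;
//   Blocking blocking(k, m, n, /*num_threads=*/4);
//   for (Eigen::Index j = 0; j < n; j += blocking.nc())      // rhs column panels
//     for (Eigen::Index l = 0; l < k; l += blocking.kc())    // depth panels
//       for (Eigen::Index i = 0; i < m; i += blocking.mc())  // lhs row panels
//         { /* pack the (i,l) lhs and (l,j) rhs blocks, then multiply */ }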
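//
// Usage sketch for the libxsmm path (illustrative only; the driver loop is an
// assumption about how these parameters could be consumed, not the actual
// Eigen contraction driver):
//
//   Eigen::internal::TensorXsmmContractionBlocking<float, float, Eigen::Index>
//       blocking(k, m, n, /*max_num_threads=*/8);
//   for (Eigen::Index bm = 0; bm < blocking.blocks_m(); ++bm)
//     for (Eigen::Index bn = 0; bn < blocking.blocks_n(); ++bn)
//       for (Eigen::Index bk = 0; bk < blocking.blocks_k(); ++bk) {
//         if (blocking.copyA()) { /* pack an mc() x kc() panel of the lhs */ }
//         if (blocking.copyB()) { /* pack a  kc() x nc() panel of the rhs */ }
//         /* dispatch an xsmm kernel with prefetch mode blocking.prefetch(),
//            spreading the blocks over blocking.num_threads() threads */
//       }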