From 071311821e509d87bec609d6a3aeea9dc74cfd66 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Mon, 19 Aug 2019 11:44:25 -0700
Subject: Remove XSMM support from Tensor module

---
 .../CXX11/src/Tensor/TensorContractionBlocking.h   | 135 ---------------------
 1 file changed, 135 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index c51f3f8dd..974feb0ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -67,141 +67,6 @@ class TensorContractionBlocking {
   StorageIndex nc_;
 };
 
-
-
-#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename StorageIndex>
-class TensorXsmmContractionBlocking {
- public:
-  TensorXsmmContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
-      size_t max_num_threads = 1, bool transposeA = false,
-      bool transposeB = false):
-    k_(k), m_(m), n_(n), transposeA_(transposeA),
-    transposeB_(transposeB), num_threads_(max_num_threads) {
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
-    if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
-      mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
-      kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
-      nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
-      outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
-      outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
-      outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
-      copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
-      copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
-      outer_m_ = outer_m_ != 0 ? outer_m_ : m;
-      outer_k_ = outer_k_ != 0 ? outer_k_ : k;
-      outer_n_ = outer_n_ != 0 ? outer_n_ : n;
-    }
-#else
-    // Defaults, possibly overridden per-platform.
-    copyA_ = true;
-    copyB_ = false;
-
-    // If the matrix is small enough, don't do blocking, just call single xsmm
-    // kernel.
-    if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
-      mc_ = m; kc_ = k; nc_ = n;
-      outer_m_ = m; outer_k_ = k; outer_n_ = n;
-      copyA_ = false; copyB_ = false;
-    } else {
-      int arch = libxsmm_cpuid_x86();
-
-      if (arch == LIBXSMM_X86_AVX512_CORE) {
-        // skylake
-        mc_ = 64; kc_ = 64; nc_ = 24;
-        outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
-        // Hack to use this kernel architecture as the other one has performance
-        // issues (no hardware prefetching).
-        // TODO(nishantpatil): This should be removed if the issues are fixed,
-        // or this one becomes the default.
-        setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
-      } else if (arch == LIBXSMM_X86_AVX2) {
-        // haswell
-        mc_ = 32; kc_ = 192; nc_ = 33;
-        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
-      } else if (arch == LIBXSMM_X86_AVX) {
-        // ivybridge
-        mc_ = 32; kc_ = 192; nc_ = 48;
-        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
-      } else {
-        // generic kernel size, usually performing well
-        mc_ = 32; kc_ = 128; nc_ = 32;
-        outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
-      }
-
-      // Only copy if it makes the stride smaller.
-      copyA_ = copyA_ && (m > mc_);
-      copyB_ = copyB_ && (k > kc_);
-    }
-
-    // We need to copy anyway if transposing
-    copyA_ = copyA_ || transposeA;
-    copyB_ = copyB_ || transposeB;
-
-    // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
-    prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
-
-#endif
-
-    mc_ = mc_ > m ? m : mc_;
-    nc_ = nc_ > n ? n : nc_;
-    kc_ = kc_ > k ? k : kc_;
-
-    size_t compute_parallelism = (m / mc_) * (n / nc_);
-    size_t pack_parallelism = 0;
-    if (copyA_) {
-      pack_parallelism += (m / mc_) * (k / kc_);
-    }
-    if (copyB_) {
-      pack_parallelism += (n / nc_) * (k / kc_);
-    }
-    size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
-
-    num_threads_ = numext::mini<size_t>(num_threads_,
-                                        parallelism / MIN_JOBS_PER_THREAD);
-    num_threads_ = numext::maxi<size_t>(num_threads_, 1);
-
-    // For optimal performance outer block sizes should be multiples of kernel
-    // sizes, or bigger than matrix size (=no outer blocking).
-    eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
-    eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
-    eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
-  }
-
-  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
-  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
-  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
-  EIGEN_ALWAYS_INLINE StorageIndex outer_k() const { return outer_k_; }
-  EIGEN_ALWAYS_INLINE StorageIndex outer_m() const { return outer_m_; }
-  EIGEN_ALWAYS_INLINE StorageIndex outer_n() const { return outer_n_; }
-  EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
-  EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
-  EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
-  EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
-  EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
-  EIGEN_ALWAYS_INLINE StorageIndex blocks_m() const { return divup(m_, mc_); }
-  EIGEN_ALWAYS_INLINE StorageIndex blocks_k() const { return divup(k_, kc_); }
-  EIGEN_ALWAYS_INLINE StorageIndex blocks_n() const { return divup(n_, nc_); }
-  EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
-    return prefetch_;
-  }
-
- private:
-  StorageIndex k_, m_, n_;
-  StorageIndex kc_, mc_, nc_;
-  StorageIndex outer_k_, outer_m_, outer_n_;
-  bool copyA_, copyB_, transposeA_, transposeB_;
-  size_t num_threads_;
-
-  // Threshold for m*k*n to skip blocking and just call libxsmm
-  const double LIBXSMM_THRESHOLD = 80*80*80;
-  // For computing optimal number of threads - so that each thread gets at least
-  // that many jobs.
-  const double MIN_JOBS_PER_THREAD = 3;
-  libxsmm_gemm_prefetch_type prefetch_;
-};
-#endif  // EIGEN_USE_LIBXSMM
-
 } // end namespace internal
 } // end namespace Eigen
--
cgit v1.2.3
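
Editor's note: for anyone porting the removed heuristics, the block-size decision reduces to a small standalone function. The sketch below is a reconstruction, not Eigen API; the names BlockSizes and choose_block_sizes are hypothetical, and the per-CPU tables from the removed code are collapsed into the generic case.

#include <algorithm>
#include <cstddef>

// Hypothetical reconstruction of the removed fast-path and clamping logic.
struct BlockSizes { std::size_t mc, kc, nc; };

BlockSizes choose_block_sizes(std::size_t m, std::size_t k, std::size_t n) {
  // Below this m*k*n threshold the removed code skipped blocking entirely
  // and issued a single xsmm kernel call over the whole problem.
  const double kLibxsmmThreshold = 80.0 * 80.0 * 80.0;
  BlockSizes b;
  if (static_cast<double>(m) * k * n <= kLibxsmmThreshold) {
    b = {m, k, n};      // small problem: no blocking at all
  } else {
    b = {32, 128, 32};  // generic kernel sizes (arch-specific in the original)
  }
  // A block must never exceed the matrix dimension it tiles.
  b.mc = std::min(b.mc, m);
  b.kc = std::min(b.kc, k);
  b.nc = std::min(b.nc, n);
  return b;
}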
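The thread-count heuristic is similarly self-contained: cap the thread count so that each thread gets at least three block-sized jobs, counting packing work only when copies are enabled. Again a hedged sketch with illustrative names (estimate_num_threads is not an Eigen function), using std:: helpers in place of numext::mini/maxi.

#include <algorithm>
#include <cstddef>

// Hypothetical reconstruction of the removed thread-count heuristic.
std::size_t estimate_num_threads(std::size_t m, std::size_t n, std::size_t k,
                                 std::size_t mc, std::size_t nc, std::size_t kc,
                                 bool copyA, bool copyB,
                                 std::size_t max_num_threads) {
  const std::size_t kMinJobsPerThread = 3;  // each thread should get >= 3 jobs
  // Compute jobs: one per (mc x nc) output block.
  std::size_t compute_parallelism = (m / mc) * (n / nc);
  // Packing jobs: one per packed LHS/RHS panel, only when copying is enabled.
  std::size_t pack_parallelism = 0;
  if (copyA) pack_parallelism += (m / mc) * (k / kc);
  if (copyB) pack_parallelism += (n / nc) * (k / kc);
  std::size_t parallelism = std::max(compute_parallelism, pack_parallelism);
  // Never exceed the pool size or the available parallelism; use at least one.
  std::size_t num_threads =
      std::min(max_num_threads, parallelism / kMinJobsPerThread);
  return std::max<std::size_t>(num_threads, 1);
}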