From f9eff17e915e270e654287723cea67be495f5c5f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 21 Dec 2016 12:32:06 -0800
Subject: Leverage libxsmm kernels within single threaded contractions

---
 .../CXX11/src/Tensor/TensorContractionBlocking.h | 134 +++++++++++++++++++++
 1 file changed, 134 insertions(+)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 5cf7b4f71..d34f9caee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -50,6 +50,140 @@ class TensorContractionBlocking {
 };
 
+
+#if defined(EIGEN_USE_LIBXSMM)
+template <typename LhsScalar, typename RhsScalar, typename Index>
+class TensorXsmmContractionBlocking {
+ public:
+  TensorXsmmContractionBlocking(Index k, Index m, Index n,
+      size_t max_num_threads = 1, bool transposeA = false,
+      bool transposeB = false):
+    k_(k), m_(m), n_(n), transposeA_(transposeA),
+    transposeB_(transposeB), num_threads_(max_num_threads) {
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+    if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+      mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
+      kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
+      nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
+      outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
+      outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
+      outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
+      copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
+      copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
+      outer_m_ = outer_m_ != 0 ? outer_m_ : m;
+      outer_k_ = outer_k_ != 0 ? outer_k_ : k;
+      outer_n_ = outer_n_ != 0 ? outer_n_ : n;
+    }
+#else
+    // Defaults, possibly overridden per-platform.
+    copyA_ = true;
+    copyB_ = false;
+
+    // If the matrix is small enough, don't do blocking, just call single xsmm
+    // kernel.
+    if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
+      mc_ = m; kc_ = k; nc_ = n;
+      outer_m_ = m; outer_k_ = k; outer_n_ = n;
+      copyA_ = false; copyB_ = false;
+    } else {
+      int arch = libxsmm_cpuid_x86();
+
+      if (arch == LIBXSMM_X86_AVX512_CORE) {
+        // skylake
+        mc_ = 64; kc_ = 64; nc_ = 24;
+        outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
+        // Hack to use this kernel architecture as the other one has performance
+        // issues (no hardware prefetching).
+        // TODO(nishantpatil): This should be removed if the issues are fixed,
+        // or this one becomes the default.
+        setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
+      } else if (arch == LIBXSMM_X86_AVX2) {
+        // haswell
+        mc_ = 32; kc_ = 192; nc_ = 33;
+        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
+      } else if (arch == LIBXSMM_X86_AVX) {
+        // ivybridge
+        mc_ = 32; kc_ = 192; nc_ = 48;
+        outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
+      } else {
+        // generic kernel size, usually performing well
+        mc_ = 32; kc_ = 128; nc_ = 32;
+        outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
+      }
+
+      // Only copy if it makes the stride smaller.
+      copyA_ = copyA_ && (m > mc_);
+      copyB_ = copyB_ && (k > kc_);
+    }
+
+    // We need to copy anyway if transposing
+    copyA_ = copyA_ || transposeA;
+    copyB_ = copyB_ || transposeB;
+
+    // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
+    prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
+
+#endif
+
+    mc_ = mc_ > m ? m : mc_;
+    nc_ = nc_ > n ? n : nc_;
+    kc_ = kc_ > k ? k : kc_;
+
+    size_t compute_parallelism = (m / mc_) * (n / nc_);
+    size_t pack_parallelism = 0;
+    if (copyA_) {
+      pack_parallelism += (m / mc_) * (k / kc_);
+    }
+    if (copyB_) {
+      pack_parallelism += (n / nc_) * (k / kc_);
+    }
+    size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
+
+    num_threads_ = numext::mini<size_t>(num_threads_,
+                                        parallelism / MIN_JOBS_PER_THREAD);
+    num_threads_ = numext::maxi<size_t>(num_threads_, 1);
+
+    // For optimal performance outer block sizes should be multiples of kernel
+    // sizes, or bigger than matrix size (=no outer blocking).
+    eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
+    eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
+    eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
+  }
+
+  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
+  EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
+  EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
+  EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
+  EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
+  EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
+  EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
+  EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
+  EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
+  EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
+  EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
+  EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
+    return prefetch_;
+  }
+
+ private:
+  Index k_, m_, n_;
+  Index kc_, mc_, nc_;
+  Index outer_k_, outer_m_, outer_n_;
+  bool copyA_, copyB_, transposeA_, transposeB_;
+  size_t num_threads_;
+
+  // Threshold for m*k*n to skip blocking and just call libxsmm
+  const double LIBXSMM_THRESHOLD = 80*80*80;
+  // For computing optimal number of threads - so that each thread gets at least
+  // that many jobs.
+  const double MIN_JOBS_PER_THREAD = 3;
+  libxsmm_gemm_prefetch_type prefetch_;
+};
+#endif // EIGEN_USE_LIBXSMM
+
 
 } // end namespace internal
 } // end namespace Eigen
--
cgit v1.2.3
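
Editor's note: to make the blocking heuristic in the constructor above concrete, here is a minimal standalone C++ sketch, not part of the patch and independent of Eigen and libxsmm, that replays the parallelism estimate and thread-count clamping for a hypothetical 1024x1024x1056 contraction using the Haswell/AVX2 kernel sizes (mc=32, kc=192, nc=33). The local divup helper, the problem size, and the max_num_threads cap of 8 are illustrative assumptions rather than values taken from the patch.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // Ceiling division, analogous to the divup used by blocks_m/k/n above.
    static std::size_t divup(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

    int main() {
      // Hypothetical problem size and the AVX2 (haswell) kernel sizes from the patch.
      const std::size_t m = 1024, k = 1024, n = 1056;
      const std::size_t mc = 32, kc = 192, nc = 33;
      const bool copyA = true, copyB = false;  // default copy flags before transpose checks
      const double kMinJobsPerThread = 3;      // MIN_JOBS_PER_THREAD in the patch
      const std::size_t max_num_threads = 8;   // hypothetical caller-supplied cap

      // Same parallelism estimate as the constructor: compute jobs vs. packing jobs.
      std::size_t compute_parallelism = (m / mc) * (n / nc);
      std::size_t pack_parallelism = 0;
      if (copyA) pack_parallelism += (m / mc) * (k / kc);
      if (copyB) pack_parallelism += (n / nc) * (k / kc);
      const std::size_t parallelism = std::max(compute_parallelism, pack_parallelism);

      // Clamp the thread count so each thread gets at least kMinJobsPerThread jobs.
      std::size_t num_threads = std::min<std::size_t>(
          max_num_threads,
          static_cast<std::size_t>(parallelism / kMinJobsPerThread));
      num_threads = std::max<std::size_t>(num_threads, 1);

      std::printf("block grid: %zu x %zu x %zu (m x k x n), threads: %zu\n",
                  divup(m, mc), divup(k, kc), divup(n, nc), num_threads);
      return 0;
    }

With these numbers the 32x32 grid of compute blocks dominates the packing work, and parallelism / MIN_JOBS_PER_THREAD is far above the cap, so all 8 requested threads are kept. A problem with m*k*n at or below LIBXSMM_THRESHOLD (80*80*80) would instead skip blocking entirely and issue a single xsmm kernel call.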