From f9eff17e915e270e654287723cea67be495f5c5f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 21 Dec 2016 12:32:06 -0800
Subject: Leverage libxsmm kernels within single threaded contractions

---
 .../Eigen/CXX11/src/Tensor/TensorContraction.h | 291 ++++++++++++++++++++-
 1 file changed, 289 insertions(+), 2 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 2ac6abf69..c446ba1af 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,6 +20,70 @@ namespace Eigen {
  *
  */
 namespace internal {
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+template
+void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
+  size_t psize = packet_traits::size;           // Packet size
+  typedef typename packet_traits::type Packet;  // Packet type
+  size_t alignment = psize*sizeof(Scalar);      // Needed alignment
+  if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
+      (ldsrc*sizeof(Scalar)) % alignment == 0 &&
+      reinterpret_cast(src) % alignment == 0 &&
+      reinterpret_cast(dst) % alignment == 0) {
+    // Optimized version using packets
+    size_t num_packets = rows / psize;
+    for (Index col = 0; col < cols; ++col) {
+      EIGEN_ASM_COMMENT("begin pack_simple inner copy");
+      // Unrolled manually 4 times.
+      for (size_t i=0; i < num_packets/4; ++i) {
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+      }
+      for (size_t i=0; i < num_packets%4; ++i) {
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+      }
+      dst += lddst - num_packets*psize;
+      src += ldsrc - num_packets*psize;
+      EIGEN_ASM_COMMENT("end pack_simple inner copy");
+    }
+  } else {
+    // Naive memcpy calls
+    for (Index col = 0; col < cols; ++col) {
+      memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
+    }
+  }
+}
+
+template
+  struct libxsmm_wrapper {
+    libxsmm_wrapper() {}
+    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {}
+    void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {}
+    void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {}
+  };
+
+  template<>
+  struct libxsmm_wrapper: public libxsmm_mmfunction {
+    libxsmm_wrapper(): libxsmm_mmfunction() {}
+    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+        libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+  };
+
+  template<>
+  struct libxsmm_wrapper: public libxsmm_mmfunction {
+    libxsmm_wrapper(): libxsmm_mmfunction() {}
+    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+        libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+  };
+#endif
+
 template
 struct traits >
@@ -317,6 +381,8 @@ struct TensorContractionEvaluatorBase
       }
     }
 
+    EnableXSMMIfPossible(eval_op_indices);
+
     // If the layout is RowMajor, we need to reverse the m_dimensions
     if (static_cast(Layout) == static_cast(RowMajor)) {
       for (int
i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -422,6 +488,13 @@ struct TensorContractionEvaluatorBase template EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) + if (m_can_use_xsmm) { + evalGemmXSMM(buffer); + return; + } + #endif + // columns in left side, rows in right side const Index k = this->m_k_size; @@ -538,7 +611,221 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } - protected: +protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array, ContractDims>& eval_op_indices) { + m_can_use_xsmm = false; + +#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + if (!std::is_same::value || + !std::is_same::value || + !(std::is_same::value || + std::is_same::value) || + m_leftImpl.data() == NULL || + m_rightImpl.data() == NULL) { + return; + } + + // Check if we can use faster matmul algorithms. For contraction to be + // equivalent to matmul, we need both lhs and rhs contracting dims sequences + // to be either a prefix or suffix of all dims. Also, the order of both + // must be the same, so we don't have to do reordering. + // For example: + // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)] + // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)] + // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)] + // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)] + // Depending if contraction dims are prefix or suffix of all dims we need to + // pre-transpose matrices in matmul algorithm: + // lhs: prefix -> transpose, suffix -> no transpose + // rhs: prefix -> no transpose, suffix -> transpose + // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular, + // non-transposed matmul. + if (ContractDims == 0) { + // This case is totally uninteresting, filter it out to avoid problems + // with iterations in further tests. + return; + } + + // Check if RHS dims list is increasing. LHS already is, so if not, the + // order is different and we cannot do matmul. + for (int i = 1; i < ContractDims; i++) { + if (eval_op_indices[i].second < eval_op_indices[i-1].second) { + return; + } + } + + // Check if no holes. + int diff; + for (int i = 1; i < ContractDims; i++) { + // LHS contract dims are sorted to form an increasing seq. + diff = eval_op_indices[i].first - eval_op_indices[i-1].first; + if (diff != 1) { + return; + } + // Now we may already assume RHS contract dims seq is increasing too. + diff = eval_op_indices[i].second - eval_op_indices[i-1].second; + if (diff != 1) { + return; + } + } + + // Check if suffix or prefix. 
+ if (eval_op_indices[0].first != 0 && + eval_op_indices[ContractDims-1].first != LDims-1) { + return; + } + if (eval_op_indices[0].second != 0 && + eval_op_indices[ContractDims-1].second != RDims-1) { + return; + } + + m_can_use_xsmm = true; + #endif + } + +#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) + EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + const bool transposeA = !m_lhs_inner_dim_contiguous; + const bool transposeB = !m_rhs_inner_dim_contiguous; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + internal::TensorXsmmContractionBlocking blocking( + k, m, n, 1, transposeA, transposeB); + + // Outer blocks sizes + const Index mc_outer = blocking.outer_m(); + const Index nc_outer = blocking.outer_n(); + const Index kc_outer = blocking.outer_k(); + // Inner blocks sizes + const Index mc = blocking.mc(); + const Index nc = blocking.nc(); + const Index kc = blocking.kc(); + // Decisions whether we should copy parts of matrices + const bool copyA = blocking.copyA(); + const bool copyB = blocking.copyB(); + + const LhsScalar* leftData = m_leftImpl.data(); + const RhsScalar* rightData = m_rightImpl.data(); + + libxsmm_blasint stride_A = static_cast(transposeA ? k : m); + libxsmm_blasint stride_B = static_cast(transposeB ? n : k); + libxsmm_blasint stride_C = static_cast(m); + + libxsmm_blasint stride_blockA = static_cast(mc); + // Use bigger stride to avoid hitting same cache line too often. + // This consistently gives +~0.5 Gflops. + libxsmm_blasint stride_panelB = static_cast( + kc % 32 == 0 ? kc + 16 : kc + ); + + // Kernel for the general case (not edges) + internal::libxsmm_wrapper kernel; + + const LhsScalar *ap; + const RhsScalar *bp; + const Scalar *cp; + + LhsScalar* blockA = NULL; + RhsScalar* panelB = NULL; + + if (copyA) { + blockA = static_cast(this->m_device.allocate(mc * kc * sizeof(LhsScalar))); + } + if (copyB) { + panelB = static_cast(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); + } + + Index kernel_stride_A = copyA ? stride_blockA : stride_A; + Index kernel_stride_B = copyB ? stride_panelB : stride_B; + kernel = internal::libxsmm_wrapper(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); + + // Outer blocking + for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) { + for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) { + for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) { + using numext::mini; + + Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer; + + // Inner blocking + for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { + const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; + + if (copyB) { + if (transposeB) { + libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB); + } else { + internal::pack_simple(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B); + } + } + + for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { + const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; + + const LhsScalar * a = transposeA ? 
leftData + mi*stride_A + ki : + leftData + ki*stride_A + mi; + + if (copyA) { + if (transposeA) { + libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA); + } else { + internal::pack_simple(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); + } + } + + for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { + const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; + + const RhsScalar * b = rightData + ni*stride_B + ki; + Scalar * c = buffer + ni*stride_C + mi; + cp = c + nc*stride_C; + + const LhsScalar * actual_a = copyA ? blockA : a; + const Index actual_lda = copyA ? stride_blockA : stride_A; + ap = copyA ? blockA : a; + + const RhsScalar * actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; + const Index actual_ldb = copyB ? stride_panelB : stride_B; + bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; + + float beta = ki == 0 ? 0 : 1; + + if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { + // Most used, cached kernel. + kernel(actual_a, actual_b, c, ap, bp, cp); + } else { + // Edges - use libxsmm kernel cache. + internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, actual_lda, actual_ldb, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, ap, bp, cp); + } + } + } + } + } + } + } + + if (copyA) { + this->m_device.deallocate(blockA); + } + if (copyB) { + this->m_device.deallocate(panelB); + } + } +#endif + // Prevent assignment TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); Dimensions m_dimensions; @@ -567,6 +854,7 @@ struct TensorContractionEvaluatorBase /// required for sycl const Indices m_expr_indices; + bool m_can_use_xsmm; }; @@ -624,7 +912,6 @@ struct TensorEvaluatortemplate evalGemm(buffer); } - }; } // end namespace Eigen -- cgit v1.2.3 From 4236aebe103b0fa54f3b9e7e3c0c12094fa6e200 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 21 Dec 2016 16:42:56 -0800 Subject: Simplified the contraction code` --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 45 +++++++++------------- 1 file changed, 18 insertions(+), 27 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c446ba1af..442c14fac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -720,24 +720,20 @@ protected: const LhsScalar* leftData = m_leftImpl.data(); const RhsScalar* rightData = m_rightImpl.data(); - libxsmm_blasint stride_A = static_cast(transposeA ? k : m); - libxsmm_blasint stride_B = static_cast(transposeB ? n : k); - libxsmm_blasint stride_C = static_cast(m); + const libxsmm_blasint stride_A = static_cast(transposeA ? k : m); + const libxsmm_blasint stride_B = static_cast(transposeB ? n : k); + const libxsmm_blasint stride_C = static_cast(m); - libxsmm_blasint stride_blockA = static_cast(mc); + const libxsmm_blasint stride_blockA = static_cast(mc); // Use bigger stride to avoid hitting same cache line too often. // This consistently gives +~0.5 Gflops. - libxsmm_blasint stride_panelB = static_cast( + const libxsmm_blasint stride_panelB = static_cast( kc % 32 == 0 ? 
kc + 16 : kc ); // Kernel for the general case (not edges) internal::libxsmm_wrapper kernel; - const LhsScalar *ap; - const RhsScalar *bp; - const Scalar *cp; - LhsScalar* blockA = NULL; RhsScalar* panelB = NULL; @@ -748,8 +744,8 @@ protected: panelB = static_cast(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); } - Index kernel_stride_A = copyA ? stride_blockA : stride_A; - Index kernel_stride_B = copyB ? stride_panelB : stride_B; + const Index kernel_stride_A = copyA ? stride_blockA : stride_A; + const Index kernel_stride_B = copyB ? stride_panelB : stride_B; kernel = internal::libxsmm_wrapper(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); // Outer blocking @@ -763,6 +759,7 @@ protected: // Inner blocking for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; + const float beta = ki == 0 ? 0 : 1; if (copyB) { if (transposeB) { @@ -775,8 +772,8 @@ protected: for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; - const LhsScalar * a = transposeA ? leftData + mi*stride_A + ki : - leftData + ki*stride_A + mi; + const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki : + leftData + ki*stride_A + mi; if (copyA) { if (transposeA) { @@ -785,30 +782,24 @@ protected: internal::pack_simple(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); } } + const LhsScalar* actual_a = copyA ? blockA : a; for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; - const RhsScalar * b = rightData + ni*stride_B + ki; - Scalar * c = buffer + ni*stride_C + mi; - cp = c + nc*stride_C; - - const LhsScalar * actual_a = copyA ? blockA : a; - const Index actual_lda = copyA ? stride_blockA : stride_A; - ap = copyA ? blockA : a; - - const RhsScalar * actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; - const Index actual_ldb = copyB ? stride_panelB : stride_B; - bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; + const RhsScalar* b = rightData + ni*stride_B + ki; + Scalar* c = buffer + ni*stride_C + mi; + const Scalar* cp = c + nc*stride_C; - float beta = ki == 0 ? 0 : 1; + const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; + const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { // Most used, cached kernel. - kernel(actual_a, actual_b, c, ap, bp, cp); + kernel(actual_a, actual_b, c, actual_a, bp, cp); } else { // Edges - use libxsmm kernel cache. - internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, actual_lda, actual_ldb, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, ap, bp, cp); + internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp); } } } -- cgit v1.2.3 From e6b10202218631be755f19c41fe01287b9a37f90 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 24 Jan 2017 13:55:18 -0800 Subject: Adds a fast memcpy function to Eigen. This takes advantage of the following: 1. For small fixed sizes, the compiler generates inline code for memcpy, which is much faster. 2. 
My colleague eriche at googl dot com discovered that for large sizes, memmove is
significantly faster than memcpy (at least on Linux with GCC or Clang). See
benchmark numbers measured on a Haswell (HP Z440) workstation here:
https://docs.google.com/a/google.com/spreadsheets/d/1jLs5bKzXwhpTySw65MhG1pZpsIwkszZqQTjwrd_n0ic/pubhtml
This is of course surprising since memcpy is a less constrained version of memmove.
This stackoverflow thread contains some speculation as to the causes:
http://stackoverflow.com/questions/22793669/poor-memcpy-performance-on-linux

Below are numbers for copying and slicing tensors using the multithreaded
TensorDevice. The numbers show significant improvements for memcpy of very small
blocks and for memcpy of large blocks single threaded (we were already able to
saturate memory bandwidth for >1 threads before on large blocks). The
"slicingSmallPieces" benchmark also shows small consistent improvements, since
memcpy cost is a fair portion of that particular computation.

The benchmarks operate on NxN matrices, and the names are of the form
BM_$OP_${NUMTHREADS}T/${N}.

Measured improvements in wall clock time:

Run on rmlarsen3.mtv (12 X 3501 MHz CPUs); 2017-01-20T11:26:31.493023454-08:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB

Benchmark                          Base (ns)   New (ns)   Improvement
------------------------------------------------------------------
BM_memcpy_1T/2                          3.48       2.39        +31.3%
BM_memcpy_1T/8                          12.3       6.51        +47.0%
BM_memcpy_1T/64                          371        383         -3.2%
BM_memcpy_1T/512                       66922      66720         +0.3%
BM_memcpy_1T/4k                      9892867    6849682        +30.8%
BM_memcpy_1T/5k                     14951099   10332856        +30.9%
BM_memcpy_2T/2                          3.50       2.46        +29.7%
BM_memcpy_2T/8                          12.3       7.66        +37.7%
BM_memcpy_2T/64                          371        376         -1.3%
BM_memcpy_2T/512                       66652      66788         -0.2%
BM_memcpy_2T/4k                      6145012    6117776         +0.4%
BM_memcpy_2T/5k                      9181478    9010942         +1.9%
BM_memcpy_4T/2                          3.47       2.47        +31.0%
BM_memcpy_4T/8                          12.3       6.67        +45.8%
BM_memcpy_4T/64                          374        376         -0.5%
BM_memcpy_4T/512                       67833      68019         -0.3%
BM_memcpy_4T/4k                      5057425    5188253         -2.6%
BM_memcpy_4T/5k                      7555638    7779468         -3.0%
BM_memcpy_6T/2                          3.51       2.50        +28.8%
BM_memcpy_6T/8                          12.3       7.61        +38.1%
BM_memcpy_6T/64                          373        378         -1.3%
BM_memcpy_6T/512                       66871      66774         +0.1%
BM_memcpy_6T/4k                      5112975    5233502         -2.4%
BM_memcpy_6T/5k                      7614180    7772246         -2.1%
BM_memcpy_8T/2                          3.47       2.41        +30.5%
BM_memcpy_8T/8                          12.4       10.5        +15.3%
BM_memcpy_8T/64                          372        388         -4.3%
BM_memcpy_8T/512                       67373      66588         +1.2%
BM_memcpy_8T/4k                      5148462    5254897         -2.1%
BM_memcpy_8T/5k                      7660989    7799058         -1.8%
BM_memcpy_12T/2                         3.50       2.40        +31.4%
BM_memcpy_12T/8                         12.4       7.55        +39.1%
BM_memcpy_12T/64                         374        378         -1.1%
BM_memcpy_12T/512                      67132      66683         +0.7%
BM_memcpy_12T/4k                     5185125    5292920         -2.1%
BM_memcpy_12T/5k                     7717284    7942684         -2.9%
BM_slicingSmallPieces_1T/2              47.3       47.5         +0.4%
BM_slicingSmallPieces_1T/8              53.6       52.3         +2.4%
BM_slicingSmallPieces_1T/64              491        476         +3.1%
BM_slicingSmallPieces_1T/512           21734      18814        +13.4%
BM_slicingSmallPieces_1T/4k           394660     396760         -0.5%
BM_slicingSmallPieces_1T/5k           218722     209244         +4.3%
BM_slicingSmallPieces_2T/2              80.7       79.9         +1.0%
BM_slicingSmallPieces_2T/8              54.2       53.1         +2.0%
BM_slicingSmallPieces_2T/64              497        477         +4.0%
BM_slicingSmallPieces_2T/512           21732      18822        +13.4%
BM_slicingSmallPieces_2T/4k           392885     390490         +0.6%
BM_slicingSmallPieces_2T/5k           221988     208678         +6.0%
BM_slicingSmallPieces_4T/2              80.8       80.1         +0.9%
BM_slicingSmallPieces_4T/8              54.1       53.2         +1.7%
BM_slicingSmallPieces_4T/64              493        476         +3.4%
BM_slicingSmallPieces_4T/512           21702      18758        +13.6%
BM_slicingSmallPieces_4T/4k           393962     404023         -2.6%
BM_slicingSmallPieces_4T/5k           249667     211732        +15.2%
BM_slicingSmallPieces_6T/2              80.5       80.1         +0.5%
BM_slicingSmallPieces_6T/8              54.4       53.4         +1.8%
BM_slicingSmallPieces_6T/64              488        478         +2.0%
BM_slicingSmallPieces_6T/512           21719      18841        +13.3%
BM_slicingSmallPieces_6T/4k           394950     397583         -0.7%
BM_slicingSmallPieces_6T/5k           223080     210148         +5.8%
BM_slicingSmallPieces_8T/2              81.2       80.4         +1.0%
BM_slicingSmallPieces_8T/8              58.1       53.5         +7.9%
BM_slicingSmallPieces_8T/64              489        480         +1.8%
BM_slicingSmallPieces_8T/512           21586      18798        +12.9%
BM_slicingSmallPieces_8T/4k           394592     400165         -1.4%
BM_slicingSmallPieces_8T/5k           219688     208301         +5.2%
BM_slicingSmallPieces_12T/2             80.2       79.8         +0.7%
BM_slicingSmallPieces_12T/8             54.4       53.4         +1.8%
BM_slicingSmallPieces_12T/64             488        476         +2.5%
BM_slicingSmallPieces_12T/512          21931      18831        +14.1%
BM_slicingSmallPieces_12T/4k          393962     396541         -0.7%
BM_slicingSmallPieces_12T/5k          218803     207965         +5.0%
---
 Eigen/src/Core/util/Memory.h                       | 61 +++++++++++++++++-----
 .../Eigen/CXX11/src/Tensor/TensorContraction.h     |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h   |  2 +-
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h      |  2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h     |  4 +-
 5 files changed, 53 insertions(+), 18 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h')

diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index c634d7ea0..6b8e307c8 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -63,7 +63,7 @@ namespace Eigen {
 
 namespace internal {
 
-EIGEN_DEVICE_FUNC 
+EIGEN_DEVICE_FUNC
 inline void throw_std_bad_alloc()
 {
 #ifdef EIGEN_EXCEPTIONS
@@ -74,6 +74,41 @@ inline void throw_std_bad_alloc()
 #endif
 }
 
+EIGEN_DEVICE_FUNC
+inline void fast_memcpy(void* dst, const void* src, size_t size) {
+#if defined(__CUDA__) || defined(__ANDROID__)
+  ::memcpy(dst, src, size);
+#else
+  switch(size) {
+    // Most compilers will generate inline code for fixed sizes,
+    // which is significantly faster for small copies.
+    case 1: memcpy(dst, src, 1); break;
+    case 2: memcpy(dst, src, 2); break;
+    case 3: memcpy(dst, src, 3); break;
+    case 4: memcpy(dst, src, 4); break;
+    case 5: memcpy(dst, src, 5); break;
+    case 6: memcpy(dst, src, 6); break;
+    case 7: memcpy(dst, src, 7); break;
+    case 8: memcpy(dst, src, 8); break;
+    case 9: memcpy(dst, src, 9); break;
+    case 10: memcpy(dst, src, 10); break;
+    case 11: memcpy(dst, src, 11); break;
+    case 12: memcpy(dst, src, 12); break;
+    case 13: memcpy(dst, src, 13); break;
+    case 14: memcpy(dst, src, 14); break;
+    case 15: memcpy(dst, src, 15); break;
+    case 16: memcpy(dst, src, 16); break;
+#ifdef EIGEN_OS_LINUX
+    // On Linux, memmove appears to be faster than memcpy for
+    // large sizes, strangely enough.
+ default: memmove(dst, src, size); break; +#else + default: memcpy(dst, src, size); break; +#endif + } +#endif +} + /***************************************************************************** *** Implementation of handmade aligned functions *** *****************************************************************************/ @@ -114,7 +149,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); - + *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -142,7 +177,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } -#else +#else EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif @@ -471,8 +506,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index } /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size - */ -template + */ +template inline Index first_multiple(Index size, Index base) { return ((size+base-1)/base)*base; @@ -502,7 +537,7 @@ template struct smart_copy_helper { { std::copy(start, end, target); } }; -// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. +// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. template struct smart_memmove_helper; template void smart_memmove(const T* start, const T* end, T* target) @@ -522,15 +557,15 @@ template struct smart_memmove_helper { template struct smart_memmove_helper { static inline void run(const T* start, const T* end, T* target) - { + { if (UIntPtr(target) < UIntPtr(start)) { std::copy(start, end, target); } - else + else { std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T); - std::copy_backward(start, end, target + count); + std::copy_backward(start, end, target + count); } } }; @@ -603,7 +638,7 @@ template void swap(scoped_array &a,scoped_array &b) { std::swap(a.ptr(),b.ptr()); } - + } // end namespace internal /** \internal @@ -622,7 +657,7 @@ template void swap(scoped_array &a,scoped_array &b) * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. */ #ifdef EIGEN_ALLOCA - + #if EIGEN_DEFAULT_ALIGN_BYTES>0 // We always manually re-align the result of EIGEN_ALLOCA. // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment. @@ -645,7 +680,7 @@ template void swap(scoped_array &a,scoped_array &b) Eigen::internal::check_size_for_overflow(SIZE); \ TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? 
NAME : 0,SIZE,true) - + #endif @@ -701,7 +736,7 @@ template void swap(scoped_array &a,scoped_array &b) * Example: * \code * // Matrix4f requires 16 bytes alignment: -* std::map< int, Matrix4f, std::less, +* std::map< int, Matrix4f, std::less, * aligned_allocator > > my_map_mat4; * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator: * std::map< int, Vector3f > my_map_vec3; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 442c14fac..39012b937 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -56,7 +56,7 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index } else { // Naive memcpy calls for (Index col = 0; col < cols; ++col) { - memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); + internal::fast_memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index ccaaa6cb2..b133781ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -22,7 +22,7 @@ struct DefaultDevice { internal::aligned_free(buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); + internal::fast_memcpy(dst, src, n); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 16180ca69..facdea735 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -106,7 +106,7 @@ struct ThreadPoolDevice { } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); + internal::fast_memcpy(dst, src, n); } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 08eb5595a..f060191ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -253,7 +253,7 @@ struct TensorEvaluator, D // get data into line_buf const Index stride = m_strides[dim]; if (stride == 1) { - memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; for (int j = 0; j < line_len; ++j, offset += stride) { @@ -271,7 +271,7 @@ struct TensorEvaluator, D // write back if (FFTDir == FFT_FORWARD && stride == 1) { - memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); -- cgit v1.2.3 From edaa0fc5d1319823393b02b002880fc7a1fa49e9 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 26 Jan 2017 12:46:06 -0800 Subject: Revert PR-292. After further investigation, the memcpy->memmove change was only good for Haswell on older versions of glibc. 
Adding a switch for small sizes is perhaps useful for string copies, but also has an overhead for larger sizes, making it a poor trade-off for general memcpy. This PR also removes a couple of unnecessary semi-colons in Eigen/src/Core/AssignEvaluator.h that caused compiler warning everywhere. --- Eigen/src/Core/AssignEvaluator.h | 4 +-- Eigen/src/Core/util/Memory.h | 37 +--------------------- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 2 +- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +- 5 files changed, 6 insertions(+), 41 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 7c7203ac6..489935b83 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -708,7 +708,7 @@ void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/* EIGEN_ONLY_USED_FOR_DEBUG(dst); EIGEN_ONLY_USED_FOR_DEBUG(src); eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); -}; +} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -719,7 +719,7 @@ void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::a if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols))) dst.resize(dstRows, dstCols); eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols); -}; +} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 572b1fe69..7d9053496 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -74,41 +74,6 @@ inline void throw_std_bad_alloc() #endif } -EIGEN_DEVICE_FUNC -inline void fast_memcpy(void* dst, const void* src, size_t size) { -#if defined(__CUDA__) || defined(__ANDROID__) - ::memcpy(dst, src, size); -#else - switch(size) { - // Most compilers will generate inline code for fixed sizes, - // which is significantly faster for small copies. - case 1: memcpy(dst, src, 1); break; - case 2: memcpy(dst, src, 2); break; - case 3: memcpy(dst, src, 3); break; - case 4: memcpy(dst, src, 4); break; - case 5: memcpy(dst, src, 5); break; - case 6: memcpy(dst, src, 6); break; - case 7: memcpy(dst, src, 7); break; - case 8: memcpy(dst, src, 8); break; - case 9: memcpy(dst, src, 9); break; - case 10: memcpy(dst, src, 10); break; - case 11: memcpy(dst, src, 11); break; - case 12: memcpy(dst, src, 12); break; - case 13: memcpy(dst, src, 13); break; - case 14: memcpy(dst, src, 14); break; - case 15: memcpy(dst, src, 15); break; - case 16: memcpy(dst, src, 16); break; -#ifdef EIGEN_OS_LINUX - // On Linux, memmove appears to be faster than memcpy for - // large sizes, strangely enough. 
- default: memmove(dst, src, size); break; -#else - default: memcpy(dst, src, size); break; -#endif - } -#endif -} - /***************************************************************************** *** Implementation of handmade aligned functions *** *****************************************************************************/ @@ -528,7 +493,7 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - fast_memcpy(target, start, size); + memcpy(target, start, size); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 39012b937..442c14fac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -56,7 +56,7 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index } else { // Naive memcpy calls for (Index col = 0; col < cols; ++col) { - internal::fast_memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); + memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index b133781ae..ccaaa6cb2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -22,7 +22,7 @@ struct DefaultDevice { internal::aligned_free(buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - internal::fast_memcpy(dst, src, n); + ::memcpy(dst, src, n); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index facdea735..16180ca69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -106,7 +106,7 @@ struct ThreadPoolDevice { } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - internal::fast_memcpy(dst, src, n); + ::memcpy(dst, src, n); } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); -- cgit v1.2.3
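
Note on the first patch above: the XSMM fast path only kicks in for contractions that EnableXSMMIfPossible can map to a plain matmul (float or double scalars, raw data pointers available, contracting dimensions forming a contiguous prefix or suffix in the same order on both sides). The sketch below is not taken from the patches; it is a minimal usage example of the kind of single-threaded contraction that path targets, under the assumption that the build defines EIGEN_USE_LIBXSMM, links against libxsmm, and enables AVX. Whether a given contraction actually dispatches to evalGemmXSMM still depends on those eligibility checks.

  // Hedged sketch: a rank-2 contraction that is shape-compatible with the
  // XSMM path (a regular, non-transposed matrix product over one dimension).
  #include <unsupported/Eigen/CXX11/Tensor>
  #include <iostream>

  int main() {
    Eigen::Tensor<float, 2> lhs(256, 512);
    Eigen::Tensor<float, 2> rhs(512, 128);
    lhs.setRandom();
    rhs.setRandom();

    // Contract lhs dimension 1 against rhs dimension 0; for 2D tensors this is
    // the "regular matmul" case described in the patch comments.
    Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
    Eigen::Tensor<float, 2> result = lhs.contract(rhs, dims);

    std::cout << "result(0, 0) = " << result(0, 0) << "\n";
    return 0;
  }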
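The evalGemmXSMM routine added above drives libxsmm through a two-level (outer/inner) cache-blocking scheme with optional packing of the operands into blockA/panelB. The following standalone sketch is my own illustration of the single-level version of that loop structure for a column-major GEMM, not the patch's code: the block sizes mc/nc/kc play the same role as in the patch, while the real implementation adds the outer blocks, the packing step, and a cached libxsmm kernel in place of the scalar inner loops.

  #include <algorithm>

  // Illustrative blocked GEMM: C(m x n) += A(m x k) * B(k x n), column-major,
  // leading dimensions equal to the row counts. Block sizes are placeholders.
  void blocked_gemm(const float* A, const float* B, float* C,
                    int m, int n, int k, int mc = 64, int nc = 64, int kc = 64) {
    for (int j0 = 0; j0 < n; j0 += nc) {
      const int jb = std::min(nc, n - j0);
      for (int p0 = 0; p0 < k; p0 += kc) {
        const int pb = std::min(kc, k - p0);
        for (int i0 = 0; i0 < m; i0 += mc) {
          const int ib = std::min(mc, m - i0);
          // In the patch this block is handled by a libxsmm kernel, with
          // beta = 0 on the first k-block and beta = 1 afterwards; here we
          // simply accumulate into C (assumed zero-initialized).
          for (int j = 0; j < jb; ++j)
            for (int p = 0; p < pb; ++p)
              for (int i = 0; i < ib; ++i)
                C[(j0 + j) * m + (i0 + i)] +=
                    A[(p0 + p) * m + (i0 + i)] * B[(j0 + j) * k + (p0 + p)];
        }
      }
    }
  }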
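The last two patches disagree about memcpy vs. memmove performance for large copies, and the revert notes that the earlier result only held for Haswell on older glibc versions. A rough way to check the claim on a given machine is a microbenchmark along the lines below; this is only a sketch, assuming a 64 MB copy repeated 20 times is representative of the "large size" regime, and the outcome is machine- and libc-dependent, which is exactly why the change was reverted.

  #include <chrono>
  #include <cstdio>
  #include <cstring>
  #include <vector>

  int main() {
    const std::size_t n = 64u * 1024 * 1024;   // "large" copy, well beyond L3
    std::vector<char> src(n, 1), dst(n, 0);
    const int reps = 20;

    // Time memcpy, then memmove, on the same non-overlapping buffers.
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < reps; ++i) std::memcpy(dst.data(), src.data(), n);
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < reps; ++i) std::memmove(dst.data(), src.data(), n);
    auto t2 = std::chrono::steady_clock::now();

    std::printf("memcpy : %.3f s\n", std::chrono::duration<double>(t1 - t0).count());
    std::printf("memmove: %.3f s\n", std::chrono::duration<double>(t2 - t1).count());
    return 0;
  }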