From f9eff17e915e270e654287723cea67be495f5c5f Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 21 Dec 2016 12:32:06 -0800
Subject: Leverage libxsmm kernels within single threaded contractions

---
 .../Eigen/CXX11/src/Tensor/TensorContraction.h | 291 ++++++++++++++++++++-
 1 file changed, 289 insertions(+), 2 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 2ac6abf69..c446ba1af 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,6 +20,70 @@ namespace Eigen {
  *
  */
 namespace internal {
+#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
+template
+void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
+  size_t psize = packet_traits::size;           // Packet size
+  typedef typename packet_traits::type Packet;  // Packet type
+  size_t alignment = psize*sizeof(Scalar);      // Needed alignment
+  if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
+      (ldsrc*sizeof(Scalar)) % alignment == 0 &&
+      reinterpret_cast(src) % alignment == 0 &&
+      reinterpret_cast(dst) % alignment == 0) {
+    // Optimized version using packets
+    size_t num_packets = rows / psize;
+    for (Index col = 0; col < cols; ++col) {
+      EIGEN_ASM_COMMENT("begin pack_simple inner copy");
+      // Unrolled manually 4 times.
+      for (size_t i=0; i < num_packets/4; ++i) {
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+      }
+      for (size_t i=0; i < num_packets%4; ++i) {
+        internal::pstore(dst, internal::pload(src));
+        dst += psize; src += psize;
+      }
+      dst += lddst - num_packets*psize;
+      src += ldsrc - num_packets*psize;
+      EIGEN_ASM_COMMENT("end pack_simple inner copy");
+    }
+  } else {
+    // Naive memcpy calls
+    for (Index col = 0; col < cols; ++col) {
+      memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
+    }
+  }
+}
+
+template
+  struct libxsmm_wrapper {
+    libxsmm_wrapper() {}
+    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {}
+    void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {}
+    void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {}
+  };
+
+  template<>
+  struct libxsmm_wrapper: public libxsmm_mmfunction {
+    libxsmm_wrapper(): libxsmm_mmfunction() {}
+    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+        libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+  };
+
+  template<>
+  struct libxsmm_wrapper: public libxsmm_mmfunction {
+    libxsmm_wrapper(): libxsmm_mmfunction() {}
+    libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
+        libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
+  };
+#endif
+
 template
 struct traits >
@@ -317,6 +381,8 @@ struct TensorContractionEvaluatorBase
       }
     }
 
+    EnableXSMMIfPossible(eval_op_indices);
+
     // If the layout is RowMajor, we need to reverse the m_dimensions
     if (static_cast(Layout) == static_cast(RowMajor)) {
       for (int
i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -422,6 +488,13 @@ struct TensorContractionEvaluatorBase template EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { + #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) + if (m_can_use_xsmm) { + evalGemmXSMM(buffer); + return; + } + #endif + // columns in left side, rows in right side const Index k = this->m_k_size; @@ -538,7 +611,221 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } - protected: +protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array, ContractDims>& eval_op_indices) { + m_can_use_xsmm = false; + +#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + if (!std::is_same::value || + !std::is_same::value || + !(std::is_same::value || + std::is_same::value) || + m_leftImpl.data() == NULL || + m_rightImpl.data() == NULL) { + return; + } + + // Check if we can use faster matmul algorithms. For contraction to be + // equivalent to matmul, we need both lhs and rhs contracting dims sequences + // to be either a prefix or suffix of all dims. Also, the order of both + // must be the same, so we don't have to do reordering. + // For example: + // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)] + // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)] + // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)] + // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)] + // Depending if contraction dims are prefix or suffix of all dims we need to + // pre-transpose matrices in matmul algorithm: + // lhs: prefix -> transpose, suffix -> no transpose + // rhs: prefix -> no transpose, suffix -> transpose + // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular, + // non-transposed matmul. + if (ContractDims == 0) { + // This case is totally uninteresting, filter it out to avoid problems + // with iterations in further tests. + return; + } + + // Check if RHS dims list is increasing. LHS already is, so if not, the + // order is different and we cannot do matmul. + for (int i = 1; i < ContractDims; i++) { + if (eval_op_indices[i].second < eval_op_indices[i-1].second) { + return; + } + } + + // Check if no holes. + int diff; + for (int i = 1; i < ContractDims; i++) { + // LHS contract dims are sorted to form an increasing seq. + diff = eval_op_indices[i].first - eval_op_indices[i-1].first; + if (diff != 1) { + return; + } + // Now we may already assume RHS contract dims seq is increasing too. + diff = eval_op_indices[i].second - eval_op_indices[i-1].second; + if (diff != 1) { + return; + } + } + + // Check if suffix or prefix. 
+ if (eval_op_indices[0].first != 0 && + eval_op_indices[ContractDims-1].first != LDims-1) { + return; + } + if (eval_op_indices[0].second != 0 && + eval_op_indices[ContractDims-1].second != RDims-1) { + return; + } + + m_can_use_xsmm = true; + #endif + } + +#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) + EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + const bool transposeA = !m_lhs_inner_dim_contiguous; + const bool transposeB = !m_rhs_inner_dim_contiguous; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + internal::TensorXsmmContractionBlocking blocking( + k, m, n, 1, transposeA, transposeB); + + // Outer blocks sizes + const Index mc_outer = blocking.outer_m(); + const Index nc_outer = blocking.outer_n(); + const Index kc_outer = blocking.outer_k(); + // Inner blocks sizes + const Index mc = blocking.mc(); + const Index nc = blocking.nc(); + const Index kc = blocking.kc(); + // Decisions whether we should copy parts of matrices + const bool copyA = blocking.copyA(); + const bool copyB = blocking.copyB(); + + const LhsScalar* leftData = m_leftImpl.data(); + const RhsScalar* rightData = m_rightImpl.data(); + + libxsmm_blasint stride_A = static_cast(transposeA ? k : m); + libxsmm_blasint stride_B = static_cast(transposeB ? n : k); + libxsmm_blasint stride_C = static_cast(m); + + libxsmm_blasint stride_blockA = static_cast(mc); + // Use bigger stride to avoid hitting same cache line too often. + // This consistently gives +~0.5 Gflops. + libxsmm_blasint stride_panelB = static_cast( + kc % 32 == 0 ? kc + 16 : kc + ); + + // Kernel for the general case (not edges) + internal::libxsmm_wrapper kernel; + + const LhsScalar *ap; + const RhsScalar *bp; + const Scalar *cp; + + LhsScalar* blockA = NULL; + RhsScalar* panelB = NULL; + + if (copyA) { + blockA = static_cast(this->m_device.allocate(mc * kc * sizeof(LhsScalar))); + } + if (copyB) { + panelB = static_cast(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); + } + + Index kernel_stride_A = copyA ? stride_blockA : stride_A; + Index kernel_stride_B = copyB ? stride_panelB : stride_B; + kernel = internal::libxsmm_wrapper(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); + + // Outer blocking + for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) { + for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) { + for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) { + using numext::mini; + + Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer; + + // Inner blocking + for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { + const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; + + if (copyB) { + if (transposeB) { + libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB); + } else { + internal::pack_simple(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B); + } + } + + for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { + const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; + + const LhsScalar * a = transposeA ? 
leftData + mi*stride_A + ki : + leftData + ki*stride_A + mi; + + if (copyA) { + if (transposeA) { + libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA); + } else { + internal::pack_simple(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); + } + } + + for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { + const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; + + const RhsScalar * b = rightData + ni*stride_B + ki; + Scalar * c = buffer + ni*stride_C + mi; + cp = c + nc*stride_C; + + const LhsScalar * actual_a = copyA ? blockA : a; + const Index actual_lda = copyA ? stride_blockA : stride_A; + ap = copyA ? blockA : a; + + const RhsScalar * actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; + const Index actual_ldb = copyB ? stride_panelB : stride_B; + bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; + + float beta = ki == 0 ? 0 : 1; + + if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { + // Most used, cached kernel. + kernel(actual_a, actual_b, c, ap, bp, cp); + } else { + // Edges - use libxsmm kernel cache. + internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, actual_lda, actual_ldb, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, ap, bp, cp); + } + } + } + } + } + } + } + + if (copyA) { + this->m_device.deallocate(blockA); + } + if (copyB) { + this->m_device.deallocate(panelB); + } + } +#endif + // Prevent assignment TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); Dimensions m_dimensions; @@ -567,6 +854,7 @@ struct TensorContractionEvaluatorBase /// required for sycl const Indices m_expr_indices; + bool m_can_use_xsmm; }; @@ -624,7 +912,6 @@ struct TensorEvaluatortemplate evalGemm(buffer); } - }; } // end namespace Eigen -- cgit v1.2.3 From 4236aebe103b0fa54f3b9e7e3c0c12094fa6e200 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 21 Dec 2016 16:42:56 -0800 Subject: Simplified the contraction code` --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 45 +++++++++------------- 1 file changed, 18 insertions(+), 27 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c446ba1af..442c14fac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -720,24 +720,20 @@ protected: const LhsScalar* leftData = m_leftImpl.data(); const RhsScalar* rightData = m_rightImpl.data(); - libxsmm_blasint stride_A = static_cast(transposeA ? k : m); - libxsmm_blasint stride_B = static_cast(transposeB ? n : k); - libxsmm_blasint stride_C = static_cast(m); + const libxsmm_blasint stride_A = static_cast(transposeA ? k : m); + const libxsmm_blasint stride_B = static_cast(transposeB ? n : k); + const libxsmm_blasint stride_C = static_cast(m); - libxsmm_blasint stride_blockA = static_cast(mc); + const libxsmm_blasint stride_blockA = static_cast(mc); // Use bigger stride to avoid hitting same cache line too often. // This consistently gives +~0.5 Gflops. - libxsmm_blasint stride_panelB = static_cast( + const libxsmm_blasint stride_panelB = static_cast( kc % 32 == 0 ? 
kc + 16 : kc ); // Kernel for the general case (not edges) internal::libxsmm_wrapper kernel; - const LhsScalar *ap; - const RhsScalar *bp; - const Scalar *cp; - LhsScalar* blockA = NULL; RhsScalar* panelB = NULL; @@ -748,8 +744,8 @@ protected: panelB = static_cast(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); } - Index kernel_stride_A = copyA ? stride_blockA : stride_A; - Index kernel_stride_B = copyB ? stride_panelB : stride_B; + const Index kernel_stride_A = copyA ? stride_blockA : stride_A; + const Index kernel_stride_B = copyB ? stride_panelB : stride_B; kernel = internal::libxsmm_wrapper(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); // Outer blocking @@ -763,6 +759,7 @@ protected: // Inner blocking for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; + const float beta = ki == 0 ? 0 : 1; if (copyB) { if (transposeB) { @@ -775,8 +772,8 @@ protected: for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; - const LhsScalar * a = transposeA ? leftData + mi*stride_A + ki : - leftData + ki*stride_A + mi; + const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki : + leftData + ki*stride_A + mi; if (copyA) { if (transposeA) { @@ -785,30 +782,24 @@ protected: internal::pack_simple(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); } } + const LhsScalar* actual_a = copyA ? blockA : a; for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; - const RhsScalar * b = rightData + ni*stride_B + ki; - Scalar * c = buffer + ni*stride_C + mi; - cp = c + nc*stride_C; - - const LhsScalar * actual_a = copyA ? blockA : a; - const Index actual_lda = copyA ? stride_blockA : stride_A; - ap = copyA ? blockA : a; - - const RhsScalar * actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; - const Index actual_ldb = copyB ? stride_panelB : stride_B; - bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; + const RhsScalar* b = rightData + ni*stride_B + ki; + Scalar* c = buffer + ni*stride_C + mi; + const Scalar* cp = c + nc*stride_C; - float beta = ki == 0 ? 0 : 1; + const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; + const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { // Most used, cached kernel. - kernel(actual_a, actual_b, c, ap, bp, cp); + kernel(actual_a, actual_b, c, actual_a, bp, cp); } else { // Edges - use libxsmm kernel cache. - internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, actual_lda, actual_ldb, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, ap, bp, cp); + internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp); } } } -- cgit v1.2.3 From e6b10202218631be755f19c41fe01287b9a37f90 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 24 Jan 2017 13:55:18 -0800 Subject: Adds a fast memcpy function to Eigen. This takes advantage of the following: 1. For small fixed sizes, the compiler generates inline code for memcpy, which is much faster. 2. 
My colleague eriche at googl dot com discovered that for large sizes, memmove is
significantly faster than memcpy (at least on Linux with GCC or Clang). See
benchmark numbers measured on a Haswell (HP Z440) workstation here:
https://docs.google.com/a/google.com/spreadsheets/d/1jLs5bKzXwhpTySw65MhG1pZpsIwkszZqQTjwrd_n0ic/pubhtml
This is of course surprising since memcpy is a less constrained version of memmove.
This stackoverflow thread contains some speculation as to the causes:
http://stackoverflow.com/questions/22793669/poor-memcpy-performance-on-linux

Below are numbers for copying and slicing tensors using the multithreaded
TensorDevice. The numbers show significant improvements for memcpy of very small
blocks and for memcpy of large blocks single threaded (we were already able to
saturate memory bandwidth for >1 threads before on large blocks). The
"slicingSmallPieces" benchmark also shows small consistent improvements, since
memcpy cost is a fair portion of that particular computation.

The benchmarks operate on NxN matrices, and the names are of the form
BM_$OP_${NUMTHREADS}T/${N}.

Measured improvements in wall clock time:

Run on rmlarsen3.mtv (12 X 3501 MHz CPUs); 2017-01-20T11:26:31.493023454-08:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB

Benchmark                          Base (ns)   New (ns)   Improvement
------------------------------------------------------------------
BM_memcpy_1T/2                          3.48       2.39        +31.3%
BM_memcpy_1T/8                          12.3       6.51        +47.0%
BM_memcpy_1T/64                          371        383         -3.2%
BM_memcpy_1T/512                       66922      66720         +0.3%
BM_memcpy_1T/4k                      9892867    6849682        +30.8%
BM_memcpy_1T/5k                     14951099   10332856        +30.9%
BM_memcpy_2T/2                          3.50       2.46        +29.7%
BM_memcpy_2T/8                          12.3       7.66        +37.7%
BM_memcpy_2T/64                          371        376         -1.3%
BM_memcpy_2T/512                       66652      66788         -0.2%
BM_memcpy_2T/4k                      6145012    6117776         +0.4%
BM_memcpy_2T/5k                      9181478    9010942         +1.9%
BM_memcpy_4T/2                          3.47       2.47        +31.0%
BM_memcpy_4T/8                          12.3       6.67        +45.8%
BM_memcpy_4T/64                          374        376         -0.5%
BM_memcpy_4T/512                       67833      68019         -0.3%
BM_memcpy_4T/4k                      5057425    5188253         -2.6%
BM_memcpy_4T/5k                      7555638    7779468         -3.0%
BM_memcpy_6T/2                          3.51       2.50        +28.8%
BM_memcpy_6T/8                          12.3       7.61        +38.1%
BM_memcpy_6T/64                          373        378         -1.3%
BM_memcpy_6T/512                       66871      66774         +0.1%
BM_memcpy_6T/4k                      5112975    5233502         -2.4%
BM_memcpy_6T/5k                      7614180    7772246         -2.1%
BM_memcpy_8T/2                          3.47       2.41        +30.5%
BM_memcpy_8T/8                          12.4       10.5        +15.3%
BM_memcpy_8T/64                          372        388         -4.3%
BM_memcpy_8T/512                       67373      66588         +1.2%
BM_memcpy_8T/4k                      5148462    5254897         -2.1%
BM_memcpy_8T/5k                      7660989    7799058         -1.8%
BM_memcpy_12T/2                         3.50       2.40        +31.4%
BM_memcpy_12T/8                         12.4       7.55        +39.1%
BM_memcpy_12T/64                         374        378         -1.1%
BM_memcpy_12T/512                      67132      66683         +0.7%
BM_memcpy_12T/4k                     5185125    5292920         -2.1%
BM_memcpy_12T/5k                     7717284    7942684         -2.9%
BM_slicingSmallPieces_1T/2              47.3       47.5         +0.4%
BM_slicingSmallPieces_1T/8              53.6       52.3         +2.4%
BM_slicingSmallPieces_1T/64              491        476         +3.1%
BM_slicingSmallPieces_1T/512           21734      18814        +13.4%
BM_slicingSmallPieces_1T/4k           394660     396760         -0.5%
BM_slicingSmallPieces_1T/5k           218722     209244         +4.3%
BM_slicingSmallPieces_2T/2              80.7       79.9         +1.0%
BM_slicingSmallPieces_2T/8              54.2       53.1         +2.0%
BM_slicingSmallPieces_2T/64              497        477         +4.0%
BM_slicingSmallPieces_2T/512           21732      18822        +13.4%
BM_slicingSmallPieces_2T/4k           392885     390490         +0.6%
BM_slicingSmallPieces_2T/5k           221988     208678         +6.0%
BM_slicingSmallPieces_4T/2              80.8       80.1         +0.9%
BM_slicingSmallPieces_4T/8              54.1       53.2         +1.7%
BM_slicingSmallPieces_4T/64              493        476         +3.4%
BM_slicingSmallPieces_4T/512           21702      18758        +13.6%
BM_slicingSmallPieces_4T/4k           393962     404023         -2.6%
BM_slicingSmallPieces_4T/5k           249667     211732        +15.2%
BM_slicingSmallPieces_6T/2              80.5       80.1         +0.5%
BM_slicingSmallPieces_6T/8              54.4       53.4         +1.8%
BM_slicingSmallPieces_6T/64              488        478         +2.0%
BM_slicingSmallPieces_6T/512           21719      18841        +13.3%
BM_slicingSmallPieces_6T/4k           394950     397583         -0.7%
BM_slicingSmallPieces_6T/5k           223080     210148         +5.8%
BM_slicingSmallPieces_8T/2              81.2       80.4         +1.0%
BM_slicingSmallPieces_8T/8              58.1       53.5         +7.9%
BM_slicingSmallPieces_8T/64              489        480         +1.8%
BM_slicingSmallPieces_8T/512           21586      18798        +12.9%
BM_slicingSmallPieces_8T/4k           394592     400165         -1.4%
BM_slicingSmallPieces_8T/5k           219688     208301         +5.2%
BM_slicingSmallPieces_12T/2             80.2       79.8         +0.7%
BM_slicingSmallPieces_12T/8             54.4       53.4         +1.8%
BM_slicingSmallPieces_12T/64             488        476         +2.5%
BM_slicingSmallPieces_12T/512          21931      18831        +14.1%
BM_slicingSmallPieces_12T/4k          393962     396541         -0.7%
BM_slicingSmallPieces_12T/5k          218803     207965         +5.0%
---
 Eigen/src/Core/util/Memory.h                       | 61 +++++++++++++++++-----
 .../Eigen/CXX11/src/Tensor/TensorContraction.h     |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h   |  2 +-
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h      |  2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h     |  4 +-
 5 files changed, 53 insertions(+), 18 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h')

diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index c634d7ea0..6b8e307c8 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -63,7 +63,7 @@ namespace Eigen {
 
 namespace internal {
 
-EIGEN_DEVICE_FUNC 
+EIGEN_DEVICE_FUNC
 inline void throw_std_bad_alloc()
 {
 #ifdef EIGEN_EXCEPTIONS
@@ -74,6 +74,41 @@ inline void throw_std_bad_alloc()
 #endif
 }
 
+EIGEN_DEVICE_FUNC
+inline void fast_memcpy(void* dst, const void* src, size_t size) {
+#if defined(__CUDA__) || defined(__ANDROID__)
+  ::memcpy(dst, src, size);
+#else
+  switch(size) {
+    // Most compilers will generate inline code for fixed sizes,
+    // which is significantly faster for small copies.
+    case 1: memcpy(dst, src, 1); break;
+    case 2: memcpy(dst, src, 2); break;
+    case 3: memcpy(dst, src, 3); break;
+    case 4: memcpy(dst, src, 4); break;
+    case 5: memcpy(dst, src, 5); break;
+    case 6: memcpy(dst, src, 6); break;
+    case 7: memcpy(dst, src, 7); break;
+    case 8: memcpy(dst, src, 8); break;
+    case 9: memcpy(dst, src, 9); break;
+    case 10: memcpy(dst, src, 10); break;
+    case 11: memcpy(dst, src, 11); break;
+    case 12: memcpy(dst, src, 12); break;
+    case 13: memcpy(dst, src, 13); break;
+    case 14: memcpy(dst, src, 14); break;
+    case 15: memcpy(dst, src, 15); break;
+    case 16: memcpy(dst, src, 16); break;
+#ifdef EIGEN_OS_LINUX
+    // On Linux, memmove appears to be faster than memcpy for
+    // large sizes, strangely enough.
+ default: memmove(dst, src, size); break; +#else + default: memcpy(dst, src, size); break; +#endif + } +#endif +} + /***************************************************************************** *** Implementation of handmade aligned functions *** *****************************************************************************/ @@ -114,7 +149,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); - + *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -142,7 +177,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } -#else +#else EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif @@ -471,8 +506,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index } /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size - */ -template + */ +template inline Index first_multiple(Index size, Index base) { return ((size+base-1)/base)*base; @@ -502,7 +537,7 @@ template struct smart_copy_helper { { std::copy(start, end, target); } }; -// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. +// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. template struct smart_memmove_helper; template void smart_memmove(const T* start, const T* end, T* target) @@ -522,15 +557,15 @@ template struct smart_memmove_helper { template struct smart_memmove_helper { static inline void run(const T* start, const T* end, T* target) - { + { if (UIntPtr(target) < UIntPtr(start)) { std::copy(start, end, target); } - else + else { std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T); - std::copy_backward(start, end, target + count); + std::copy_backward(start, end, target + count); } } }; @@ -603,7 +638,7 @@ template void swap(scoped_array &a,scoped_array &b) { std::swap(a.ptr(),b.ptr()); } - + } // end namespace internal /** \internal @@ -622,7 +657,7 @@ template void swap(scoped_array &a,scoped_array &b) * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. */ #ifdef EIGEN_ALLOCA - + #if EIGEN_DEFAULT_ALIGN_BYTES>0 // We always manually re-align the result of EIGEN_ALLOCA. // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment. @@ -645,7 +680,7 @@ template void swap(scoped_array &a,scoped_array &b) Eigen::internal::check_size_for_overflow(SIZE); \ TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? 
NAME : 0,SIZE,true) - + #endif @@ -701,7 +736,7 @@ template void swap(scoped_array &a,scoped_array &b) * Example: * \code * // Matrix4f requires 16 bytes alignment: -* std::map< int, Matrix4f, std::less, +* std::map< int, Matrix4f, std::less, * aligned_allocator > > my_map_mat4; * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator: * std::map< int, Vector3f > my_map_vec3; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 442c14fac..39012b937 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -56,7 +56,7 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index } else { // Naive memcpy calls for (Index col = 0; col < cols; ++col) { - memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); + internal::fast_memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index ccaaa6cb2..b133781ae 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -22,7 +22,7 @@ struct DefaultDevice { internal::aligned_free(buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); + internal::fast_memcpy(dst, src, n); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 16180ca69..facdea735 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -106,7 +106,7 @@ struct ThreadPoolDevice { } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); + internal::fast_memcpy(dst, src, n); } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 08eb5595a..f060191ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -253,7 +253,7 @@ struct TensorEvaluator, D // get data into line_buf const Index stride = m_strides[dim]; if (stride == 1) { - memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; for (int j = 0; j < line_len; ++j, offset += stride) { @@ -271,7 +271,7 @@ struct TensorEvaluator, D // write back if (FFTDir == FFT_FORWARD && stride == 1) { - memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); -- cgit v1.2.3 From edaa0fc5d1319823393b02b002880fc7a1fa49e9 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 26 Jan 2017 12:46:06 -0800 Subject: Revert PR-292. After further investigation, the memcpy->memmove change was only good for Haswell on older versions of glibc. 
Adding a switch for small sizes is perhaps useful for string copies, but also has an overhead for larger sizes, making it a poor trade-off for general memcpy. This PR also removes a couple of unnecessary semi-colons in Eigen/src/Core/AssignEvaluator.h that caused compiler warning everywhere. --- Eigen/src/Core/AssignEvaluator.h | 4 +-- Eigen/src/Core/util/Memory.h | 37 +--------------------- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 2 +- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +- 5 files changed, 6 insertions(+), 41 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h') diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 7c7203ac6..489935b83 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -708,7 +708,7 @@ void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/* EIGEN_ONLY_USED_FOR_DEBUG(dst); EIGEN_ONLY_USED_FOR_DEBUG(src); eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); -}; +} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -719,7 +719,7 @@ void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::a if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols))) dst.resize(dstRows, dstCols); eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols); -}; +} template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 572b1fe69..7d9053496 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -74,41 +74,6 @@ inline void throw_std_bad_alloc() #endif } -EIGEN_DEVICE_FUNC -inline void fast_memcpy(void* dst, const void* src, size_t size) { -#if defined(__CUDA__) || defined(__ANDROID__) - ::memcpy(dst, src, size); -#else - switch(size) { - // Most compilers will generate inline code for fixed sizes, - // which is significantly faster for small copies. - case 1: memcpy(dst, src, 1); break; - case 2: memcpy(dst, src, 2); break; - case 3: memcpy(dst, src, 3); break; - case 4: memcpy(dst, src, 4); break; - case 5: memcpy(dst, src, 5); break; - case 6: memcpy(dst, src, 6); break; - case 7: memcpy(dst, src, 7); break; - case 8: memcpy(dst, src, 8); break; - case 9: memcpy(dst, src, 9); break; - case 10: memcpy(dst, src, 10); break; - case 11: memcpy(dst, src, 11); break; - case 12: memcpy(dst, src, 12); break; - case 13: memcpy(dst, src, 13); break; - case 14: memcpy(dst, src, 14); break; - case 15: memcpy(dst, src, 15); break; - case 16: memcpy(dst, src, 16); break; -#ifdef EIGEN_OS_LINUX - // On Linux, memmove appears to be faster than memcpy for - // large sizes, strangely enough. 
- default: memmove(dst, src, size); break; -#else - default: memcpy(dst, src, size); break; -#endif - } -#endif -} - /***************************************************************************** *** Implementation of handmade aligned functions *** *****************************************************************************/ @@ -528,7 +493,7 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - fast_memcpy(target, start, size); + memcpy(target, start, size); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 39012b937..442c14fac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -56,7 +56,7 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index } else { // Naive memcpy calls for (Index col = 0; col < cols; ++col) { - internal::fast_memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); + memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); } } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index b133781ae..ccaaa6cb2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -22,7 +22,7 @@ struct DefaultDevice { internal::aligned_free(buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - internal::fast_memcpy(dst, src, n); + ::memcpy(dst, src, n); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index facdea735..16180ca69 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -106,7 +106,7 @@ struct ThreadPoolDevice { } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - internal::fast_memcpy(dst, src, n); + ::memcpy(dst, src, n); } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); -- cgit v1.2.3
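
Note on the first patch above: the XSMM fast path only kicks in for contractions that EnableXSMMIfPossible can map to a plain matmul (float or double scalars, raw data pointers available, contracting dimensions forming a contiguous prefix or suffix in the same order on both sides). The sketch below is not taken from the patches; it is a minimal usage example of the kind of single-threaded contraction that path targets, under the assumption that the build defines EIGEN_USE_LIBXSMM, links against libxsmm, and enables AVX. Whether a given contraction actually dispatches to evalGemmXSMM still depends on those eligibility checks.

  // Hedged sketch: a rank-2 contraction that is shape-compatible with the
  // XSMM path (a regular, non-transposed matrix product over one dimension).
  #include <unsupported/Eigen/CXX11/Tensor>
  #include <iostream>

  int main() {
    Eigen::Tensor<float, 2> lhs(256, 512);
    Eigen::Tensor<float, 2> rhs(512, 128);
    lhs.setRandom();
    rhs.setRandom();

    // Contract lhs dimension 1 against rhs dimension 0; for 2D tensors this is
    // the "regular matmul" case described in the patch comments.
    Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
    Eigen::Tensor<float, 2> result = lhs.contract(rhs, dims);

    std::cout << "result(0, 0) = " << result(0, 0) << "\n";
    return 0;
  }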
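The evalGemmXSMM routine added above drives libxsmm through a two-level (outer/inner) cache-blocking scheme with optional packing of the operands into blockA/panelB. The following standalone sketch is my own illustration of the single-level version of that loop structure for a column-major GEMM, not the patch's code: the block sizes mc/nc/kc play the same role as in the patch, while the real implementation adds the outer blocks, the packing step, and a cached libxsmm kernel in place of the scalar inner loops.

  #include <algorithm>

  // Illustrative blocked GEMM: C(m x n) += A(m x k) * B(k x n), column-major,
  // leading dimensions equal to the row counts. Block sizes are placeholders.
  void blocked_gemm(const float* A, const float* B, float* C,
                    int m, int n, int k, int mc = 64, int nc = 64, int kc = 64) {
    for (int j0 = 0; j0 < n; j0 += nc) {
      const int jb = std::min(nc, n - j0);
      for (int p0 = 0; p0 < k; p0 += kc) {
        const int pb = std::min(kc, k - p0);
        for (int i0 = 0; i0 < m; i0 += mc) {
          const int ib = std::min(mc, m - i0);
          // In the patch this block is handled by a libxsmm kernel, with
          // beta = 0 on the first k-block and beta = 1 afterwards; here we
          // simply accumulate into C (assumed zero-initialized).
          for (int j = 0; j < jb; ++j)
            for (int p = 0; p < pb; ++p)
              for (int i = 0; i < ib; ++i)
                C[(j0 + j) * m + (i0 + i)] +=
                    A[(p0 + p) * m + (i0 + i)] * B[(j0 + j) * k + (p0 + p)];
        }
      }
    }
  }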
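The last two patches disagree about memcpy vs. memmove performance for large copies, and the revert notes that the earlier result only held for Haswell on older glibc versions. A rough way to check the claim on a given machine is a microbenchmark along the lines below; this is only a sketch, assuming a 64 MB copy repeated 20 times is representative of the "large size" regime, and the outcome is machine- and libc-dependent, which is exactly why the change was reverted.

  #include <chrono>
  #include <cstdio>
  #include <cstring>
  #include <vector>

  int main() {
    const std::size_t n = 64u * 1024 * 1024;   // "large" copy, well beyond L3
    std::vector<char> src(n, 1), dst(n, 0);
    const int reps = 20;

    // Time memcpy, then memmove, on the same non-overlapping buffers.
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < reps; ++i) std::memcpy(dst.data(), src.data(), n);
    auto t1 = std::chrono::steady_clock::now();
    for (int i = 0; i < reps; ++i) std::memmove(dst.data(), src.data(), n);
    auto t2 = std::chrono::steady_clock::now();

    std::printf("memcpy : %.3f s\n", std::chrono::duration<double>(t1 - t0).count());
    std::printf("memmove: %.3f s\n", std::chrono::duration<double>(t2 - t1).count());
    return 0;
  }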