-rw-r--r-- | Eigen/src/Core/SolveTriangular.h | 2
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 124
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrix.h | 16
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2
-rw-r--r-- | Eigen/src/Core/products/Parallelizer.h | 4
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6
-rw-r--r-- | Eigen/src/Core/products/TriangularMatrixMatrix.h | 2
-rw-r--r-- | Eigen/src/Core/products/TriangularSolverMatrix.h | 4
-rw-r--r-- | blas/level3_impl.h | 12
-rw-r--r-- | test/product_large.cpp | 7
-rw-r--r-- | unsupported/Eigen/CXX11/Tensor | 2
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 13 |
13 files changed, 117 insertions, 79 deletions
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index ef17f288e..e158e3162 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
     typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
               Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
 
-    BlockingType blocking(rhs.rows(), rhs.cols(), size);
+    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
 
     triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
                             (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 090c8f4e6..b91786037 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
 }
 
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
+  static bool m_cache_sizes_initialized = false;
+  static std::ptrdiff_t m_l1CacheSize = 32*1024;
+  static std::ptrdiff_t m_l2CacheSize = 256*1024;
+  static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
+
+  if(!m_cache_sizes_initialized)
   {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
+    m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
+    m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
+    m_cache_sizes_initialized = true;
   }
-  
+
   if(action==SetAction)
   {
     // set the cpu cache size and cache all block sizes from a global cache size in byte
     eigen_internal_assert(l1!=0 && l2!=0);
     m_l1CacheSize = *l1;
     m_l2CacheSize = *l2;
+    m_l3CacheSize = *l3;
   }
   else if(action==GetAction)
   {
     eigen_internal_assert(l1!=0 && l2!=0);
     *l1 = m_l1CacheSize;
     *l2 = m_l2CacheSize;
+    *l3 = m_l3CacheSize;
   }
   else
   {
@@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
   * - the number of scalars that fit into a packet (when vectorization is enabled).
   *
   * \sa setCpuCacheSizes */
+#define CEIL(a, b) ((a)+(b)-1)/(b)
+
 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  EIGEN_UNUSED_VARIABLE(n);
   // Explanations:
   // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
   // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
@@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
   // at the register level. For vectorization purpose, these small vertical panels are unpacked,
   // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
   // stay in L1 cache.
-  std::ptrdiff_t l1, l2;
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kdiv = KcFactor * 2 * Traits::nr
-         * Traits::RhsProgress * sizeof(RhsScalar),
-    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
-    mr_mask = (0xffffffff/mr)*mr
-  };
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      k_mask = (0xffffffff/8)*8,
+
+      mr = Traits::mr,
+      mr_mask = (0xffffffff/mr)*mr,
+
+      nr = Traits::nr,
+      nr_mask = (0xffffffff/nr)*nr
+    };
+    SizeType k_cache = (l1-ksub)/kdiv;
+    if (k_cache < k) {
+      k = k_cache & k_mask;
+      eigen_assert(k > 0);
+    }
 
-  manage_caching_sizes(GetAction, &l1, &l2);
+    SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    SizeType n_per_thread = CEIL(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_assert(n_cache >= static_cast<SizeType>(nr));
+      n = n_cache & nr_mask;
+      eigen_assert(n > 0);
+    } else {
+      n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
+    }
 
-//  k = std::min<SizeType>(k, l1/kdiv);
-//  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-//  if(_m<m) m = _m & mr_mask;
-
-  // In unit tests we do not want to use extra large matrices,
-  // so we reduce the block size to check the blocking strategy is not flawed
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      SizeType m_per_thread = CEIL(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
+        m = m_cache & mr_mask;
+        eigen_assert(m > 0);
+      } else {
+        m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the block size to check the blocking strategy is not flawed
 #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-//   k = std::min<SizeType>(k,240);
-//   n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-//   m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
-
-  k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-  n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
-  m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
+    k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
+    n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
+    m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
 #else
-  k = std::min<SizeType>(k,24);
-  n = std::min<SizeType>(n,384/sizeof(RhsScalar));
-  m = std::min<SizeType>(m,384/sizeof(RhsScalar));
+    k = std::min<SizeType>(k,24);
+    n = std::min<SizeType>(n,384/sizeof(RhsScalar));
+    m = std::min<SizeType>(m,384/sizeof(RhsScalar));
 #endif
+  }
 }
 
 template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
 {
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
 
 #ifdef EIGEN_HAS_FUSE_CJMADD
@@ -1846,8 +1884,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Co
   * \sa setCpuCacheSize */
 inline std::ptrdiff_t l1CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l1;
 }
 
@@ -1855,8 +1893,8 @@ inline std::ptrdiff_t l1CacheSize()
   * \sa setCpuCacheSize */
 inline std::ptrdiff_t l2CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l2;
 }
 
@@ -1865,9 +1903,9 @@ inline std::ptrdiff_t l2CacheSize()
   * for the algorithms working per blocks.
   *
   * \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
 {
-  internal::manage_caching_sizes(SetAction, &l1, &l2);
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 4d7a6270a..49362adbe 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -299,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
+    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
     {
       this->m_mc = ActualRows;
      this->m_nc = ActualCols;
@@ -331,21 +331,21 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
 
   public:
 
-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
+    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
       this->m_kc = depth;
 
-      if(full_rows)
+      if(l3_blocking)
       {
-        DenseIndex m = this->m_mc;
-        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
       }
-      else  // full columns
+      else  // no l3 blocking
       {
+        DenseIndex m = this->m_mc;
         DenseIndex n = this->m_nc;
-        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
       }
 
       m_sizeA = this->m_mc * this->m_kc;
@@ -451,7 +451,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
                               (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
               _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
 
-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
+      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
 
       internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
     }
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index daa8a1d8a..8de39f76f 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -72,7 +72,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
     Index kc = depth; // cache block size along the K direction
     Index mc = size;  // cache block size along the M direction
     Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
+    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
     // !!! mc must be a multiple of nr:
     if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr;
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index 4079063eb..837e69415 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -49,8 +49,8 @@ inline void initParallel()
 {
   int nbt;
   internal::manage_multi_threading(GetAction, &nbt);
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 }
 
 /** \returns the max number of threads reserved for Eigen
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index d9e6084c3..21f8175d2 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -343,7 +343,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     Index kc = size;  // cache block size along the K direction
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
     // kc must smaller than mc
     kc = (std::min)(kc,mc);
 
@@ -432,10 +432,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
     LhsMapper lhs(_lhs,lhsStride);
     ResMapper res(_res,resStride);
 
-    Index kc = size; // cache block size along the K direction
+    Index kc = size;  // cache block size along the K direction
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
     std::size_t sizeB = kc*cols;
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
     ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 77aa3e5ee..4cbb79da0 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -412,7 +412,7 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
     Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
                                          : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
 
-    BlockingType blocking(stripedRows, stripedCols, stripedDepth);
+    BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
 
     internal::product_triangular_matrix_matrix<Scalar, Index,
       Mode, LhsIsTriangular,
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 238d6fc02..f5de67c59 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -81,8 +81,8 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
    // coherence when accessing the rhs elements
-    std::ptrdiff_t l1, l2;
-    manage_caching_sizes(GetAction, &l1, &l2);
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
     Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
     subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
diff --git a/blas/level3_impl.h b/blas/level3_impl.h
index a05872666..37a803ced 100644
--- a/blas/level3_impl.h
+++ b/blas/level3_impl.h
@@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
     else                matrix(c, *m, *n, *ldc) *= beta;
   }
 
-  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,true);
+  internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,1,true);
 
   int code = OP(*opa) | (OP(*opb) << 2);
   func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0);
@@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
 
   if(SIDE(*side)==LEFT)
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
     func[code](*m, *n, a, *lda, b, *ldb, blocking);
   }
   else
  {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
     func[code](*n, *m, a, *lda, b, *ldb, blocking);
   }
 
@@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
 
   if(SIDE(*side)==LEFT)
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
     func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking);
   }
   else
   {
-    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
+    internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
     func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking);
   }
   return 1;
@@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal
   else if(*n<0)                                                           info = 3;
   else if(*k<0)                                                           info = 4;
   else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                         info = 7;
-  else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k))                         info = 9;
+  else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k))                         info = 9;
   else if(*ldc<std::max(1,*n))                                            info = 12;
   if(info)
     return xerbla_(SCALAR_SUFFIX_UP"HER2K",&info,6);
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 11531aa1d..ffb8b7bf2 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -39,15 +39,16 @@ void test_product_large()
     // check the functions to setup blocking sizes compile and do not segfault
     // FIXME check they do what they are supposed to do !!
     std::ptrdiff_t l1 = internal::random<int>(10000,20000);
-    std::ptrdiff_t l2 = internal::random<int>(1000000,2000000);
-    setCpuCacheSizes(l1,l2);
+    std::ptrdiff_t l2 = internal::random<int>(100000,200000);
+    std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
+    setCpuCacheSizes(l1,l2,l3);
     VERIFY(l1==l1CacheSize());
     VERIFY(l2==l2CacheSize());
     std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
     std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
     std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
     // only makes sure it compiles fine
-    internal::computeProductBlockingSizes<float,float>(k1,m1,n1);
+    internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1);
   }
 
   {
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 7ec60044e..47447f446 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -55,7 +55,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
-//#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 1e6f276e0..cd992daab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -766,7 +766,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType;
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    BlockingType blocking(m, n, k, true);
+    BlockingType blocking(m, n, k, 1, true);
     const Index kc = blocking.kc();
     const Index mc = (std::min)(m, blocking.mc());
     const Index nc = (std::min)(n, blocking.nc());
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index dc0513305..8e4c7c11d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -152,7 +152,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index mc = m;
     Index nc = n;
     Index kc = k;
-    internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc/*, num_threads*/);
+    internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads);
     eigen_assert(mc <= m);
     eigen_assert(nc <= n);
     eigen_assert(kc <= k);
@@ -197,9 +197,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 
     // this should really be numBlockAs * n_blocks;
     const Index num_kernel_promises = num_threads * n_blocks;
-    Promise p;
-    p.set_value();
-    std::vector<Promise> kernel_promises(num_kernel_promises, p);
+    std::vector<Promise> kernel_promises(num_kernel_promises);
+    for (int i = 0; i < kernel_promises.size(); ++i) {
+      kernel_promises[i].set_value();
+    }
 
     for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
       const Index k_start = k_block_idx * kc;
@@ -275,8 +276,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           need_to_pack, // need_to_pack
         };
 
-        typedef decltype(Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>) Func;
-        this->m_device.enqueueNoFuture<Func, packRKArg>(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
+        this->m_device.enqueueNoFuture(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
       }
     }
   }
@@ -338,7 +338,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                     actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
 
       const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
-      eigen_assert(!(*arg.kernel_promises)[set_idx].ready());
      (*arg.kernel_promises)[set_idx].set_value();
     }
   }
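For reference, a minimal usage sketch of the two entry points whose signatures change in this patch, modeled on the test/product_large.cpp hunk above. The cache sizes, block sizes, and thread count below are illustrative placeholders, not values taken from the patch:

#include <Eigen/Core>
#include <cstddef>

int main()
{
  // setCpuCacheSizes() now takes all three cache levels (illustrative sizes).
  std::ptrdiff_t l1 = 32*1024, l2 = 256*1024, l3 = 4*1024*1024;
  Eigen::setCpuCacheSizes(l1, l2, l3);

  // computeProductBlockingSizes() now takes the number of threads that will
  // share the L3 cache; the requested k/m/n block sizes are refined in place.
  std::ptrdiff_t k = 4096, m = 4096, n = 4096;
  int num_threads = 4;
  Eigen::internal::computeProductBlockingSizes<float,float>(k, m, n, num_threads);

  return 0;
}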