4 files changed, 135 insertions, 31 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 35523ec73..a59a5d5b2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -167,61 +167,61 @@ struct TensorBlockCopyOp {
     }
 
     if (src_stride == 1) {
-      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
       if (dst_stride == 1) {
         // LINEAR
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::ploadu<Packet>(src + i);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = src[i];
         }
       } else {
         // SCATTER
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::ploadu<Packet>(src + i);
           internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = src[i];
         }
       }
     } else if (src_stride == 0) {
-      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
       if (dst_stride == 1) {
         // LINEAR
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pload1<Packet>(src);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = *src;
         }
       } else {
         // SCATTER
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pload1<Packet>(src);
           internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = *src;
         }
       }
     } else {
       if (dst_stride == 1) {
         // GATHER
-        const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pgather<Scalar, Packet>(src + i * src_stride, src_stride);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = src[i * src_stride];
         }
       } else {
         // RANDOM
-        for (Index i = 0; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = src[i * src_stride];
         }
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index b92753c44..0c2bbcaa0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -671,7 +671,17 @@ struct TensorContractionEvaluatorBase
         0, k, 1);
   }
 
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
+      Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                    rhs_inner_dim_reordered, Alignment,
+                    /*use_output_kernel*/ false>(buffer, k_start, k_end,
+                                                 num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel = true>
   EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
@@ -740,7 +750,7 @@ struct TensorContractionEvaluatorBase
       const Index actual_mc = numext::mini(i2+mc,m)-i2;
       for (Index k2 = k_start; k2 < k_end; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+        const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
         TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
                                          actual_kc, actual_mc);
 
@@ -759,7 +769,7 @@ struct TensorContractionEvaluatorBase
                                        Scalar(1));
 
           // We are done with this [i2, j2] output block.
-          if (k2 + kc >= k) {
+          if (use_output_kernel && k2 + kc >= k_end) {
            m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
                            actual_mc, actual_nc);
          }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 4553c3785..675201d23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -798,14 +798,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     auto process_block = [=, &barrier](Scalar* buf, Index first, Index last) {
      ::memset(buf, 0, m * n * sizeof(Scalar));
      TENSOR_CONTRACTION_DISPATCH(
-          this->template evalGemmPartial, Alignment,
+          this->template evalGemmPartialWithoutOutputKernel, Alignment,
          (buf, first, last, this->m_device.numThreads()));
      barrier.Notify();
    };
    Index start = 0;
    for (int blocks_left = num_blocks; blocks_left > 0; --blocks_left) {
-      // The underlying GEMM kernel assumes that k is a multiple of 8 and
-      // subtle breakage occurs if this is violated.
+      // The underlying GEMM kernel assumes that k is a multiple of packet size
+      // (currently largest packet size is 8) and subtle breakage occurs if
+      // this is violated.
      block_size = 8 * divup<Index>(k - start, 8 * blocks_left);
      Scalar* buf;
      if (start == 0) {
@@ -830,6 +831,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
      addToBuffer<Alignment>(m * n, buf, result);
      this->m_device.deallocate(buf);
    }
+
+    // Finally call output kernel with finalized output buffer.
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    this->m_output_kernel(OutputMapper(result, m),
+                          this->m_tensor_contraction_params,
+                          static_cast<Eigen::Index>(0),
+                          static_cast<Eigen::Index>(0),
+                          m, n);
  }
 
  TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 6d8e58214..cd807659e 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -306,6 +306,86 @@ static void test_multithread_contraction_with_output_kernel() {
   }
 }
 
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
 template<int DataLayout>
 void test_full_contraction() {
   int contract_size1 = internal::random<int>(1, 500);
@@ -446,21 +526,26 @@ EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
 
+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_4(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
-  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+  CALL_SUBTEST_5(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_5(test_contraction_corner_cases<RowMajor>());
 
-  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_full_contraction<RowMajor>());
 
-  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
-  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+  CALL_SUBTEST_7(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_7(test_multithreaded_reductions<RowMajor>());
 
-  CALL_SUBTEST_6(test_memcpy());
-  CALL_SUBTEST_6(test_multithread_random());
+  CALL_SUBTEST_7(test_memcpy());
+  CALL_SUBTEST_7(test_multithread_random());
 
   TestAllocator test_allocator;
-  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>(NULL));
-  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>(&test_allocator));
-  CALL_SUBTEST_6(test_threadpool_allocate(&test_allocator));
+  CALL_SUBTEST_7(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_7(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_7(test_threadpool_allocate(&test_allocator));
 }
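For reference, the new tests reuse the SqrtOutputKernel functor already defined earlier in cxx11_tensor_thread_pool.cpp (it is not part of this diff). A minimal sketch of such an output kernel, assuming the usual TensorContraction output-kernel interface (a callable taking an output block mapper, the contraction params, the block offsets and the block dimensions), could look roughly like this; the actual definition in the test file may differ in details:

// Sketch only: an output kernel applied once per finalized output block.
struct SqrtOutputKernel {
  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>& output_mapper,
      const Eigen::TensorContractionParams& /*params*/, Index /*i*/, Index /*j*/,
      Index num_rows, Index num_cols) const {
    // Walk the block column by column (ColMajor-friendly) and apply sqrt in place.
    for (Index col = 0; col < num_cols; ++col) {
      for (Index row = 0; row < num_rows; ++row) {
        output_mapper(row, col) = std::sqrt(output_mapper(row, col));
      }
    }
  }
};

With the change above, the sharded-by-inner-dim path now invokes such a kernel exactly once, on the fully accumulated result buffer, instead of skipping it (or running it on partial sums) as before.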