author    Eugene Zhulenev <ezhulenev@google.com>    2018-09-28 11:24:08 -0700
committer Eugene Zhulenev <ezhulenev@google.com>    2018-09-28 11:24:08 -0700
commit    524c81f3fad1548a92504d92326f3622075ed77b
tree      3e1fdee33c278ceef2957b324052e0dfb6dad821 /unsupported/Eigen/CXX11
parent    e95696acb313a84b33a18cc300de418b05dc58e5
Add tests for evalShardedByInnerDim contraction + fix bugs
Diffstat (limited to 'unsupported/Eigen/CXX11')
3 files changed, 39 insertions, 20 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 35523ec73..a59a5d5b2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -167,61 +167,61 @@ struct TensorBlockCopyOp {
     }
 
     if (src_stride == 1) {
-      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
       if (dst_stride == 1) {
         // LINEAR
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::ploadu<Packet>(src + i);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = src[i];
         }
       } else {
         // SCATTER
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::ploadu<Packet>(src + i);
           internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = src[i];
         }
       }
     } else if (src_stride == 0) {
-      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
       if (dst_stride == 1) {
         // LINEAR
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pload1<Packet>(src);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = *src;
         }
       } else {
         // SCATTER
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pload1<Packet>(src);
           internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = *src;
         }
       }
     } else {
       if (dst_stride == 1) {
         // GATHER
-        const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
-        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+        const StorageIndex vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+        for (StorageIndex i = 0; i < vectorized_size; i += PacketSize) {
           Packet p = internal::pgather<Scalar, Packet>(src + i * src_stride, src_stride);
           internal::pstoreu<Scalar, Packet>(dst + i, p);
         }
-        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = vectorized_size; i < num_coeff_to_copy; ++i) {
           dst[i] = src[i * src_stride];
         }
       } else {
         // RANDOM
-        for (Index i = 0; i < num_coeff_to_copy; ++i) {
+        for (StorageIndex i = 0; i < num_coeff_to_copy; ++i) {
           dst[i * dst_stride] = src[i * src_stride];
         }
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index b92753c44..0c2bbcaa0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -671,7 +671,17 @@ struct TensorContractionEvaluatorBase
         0, k, 1);
   }
 
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
+      Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                    rhs_inner_dim_reordered, Alignment,
+                    /*use_output_kernel*/ false>(buffer, k_start, k_end,
+                                                 num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel = true>
   EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
@@ -740,7 +750,7 @@ struct TensorContractionEvaluatorBase
       const Index actual_mc = numext::mini(i2+mc,m)-i2;
       for (Index k2 = k_start; k2 < k_end; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+        const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
         TensorContractionKernel::packLhs(blockA, lhs.getSubMapper(i2, k2),
                                          actual_kc, actual_mc);
 
@@ -759,7 +769,7 @@ struct TensorContractionEvaluatorBase
             Scalar(1));
 
         // We are done with this [i2, j2] output block.
-        if (k2 + kc >= k) {
+        if (use_output_kernel && k2 + kc >= k_end) {
           m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
                           actual_mc, actual_nc);
         }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 4553c3785..675201d23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -798,14 +798,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     auto process_block = [=, &barrier](Scalar* buf, Index first, Index last) {
       ::memset(buf, 0, m * n * sizeof(Scalar));
       TENSOR_CONTRACTION_DISPATCH(
-          this->template evalGemmPartial, Alignment,
+          this->template evalGemmPartialWithoutOutputKernel, Alignment,
           (buf, first, last, this->m_device.numThreads()));
       barrier.Notify();
     };
     Index start = 0;
     for (int blocks_left = num_blocks; blocks_left > 0; --blocks_left) {
-      // The underlying GEMM kernel assumes that k is a multiple of 8 and
-      // subtle breakage occurs if this is violated.
+      // The underlying GEMM kernel assumes that k is a multiple of packet size
+      // (currently largest packet size is 8) and subtle breakage occurs if
+      // this is violated.
       block_size = 8 * divup<Index>(k - start, 8 * blocks_left);
       Scalar* buf;
       if (start == 0) {
@@ -830,6 +831,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       addToBuffer<Alignment>(m * n, buf, result);
       this->m_device.deallocate(buf);
     }
+
+    // Finally call output kernel with finalized output buffer.
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    this->m_output_kernel(OutputMapper(result, m),
+                          this->m_tensor_contraction_params,
+                          static_cast<Eigen::Index>(0),
+                          static_cast<Eigen::Index>(0),
+                          m, n);
   }
 
   TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
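
For context on the sharding scheme this patch repairs: evalShardedByInnerDim splits the contraction's inner (k) dimension across threads, each thread computes a partial GEMM over its own k-range into a private buffer, the partial buffers are summed into the final result, and only then is the user-supplied output kernel applied. The standalone sketch below illustrates that flow in plain C++ with std::thread; the divup helper, the buffer layout, and the stand-in scaling "output kernel" are simplifications for illustration, not Eigen's actual implementation.

#include <algorithm>
#include <cstdio>
#include <thread>
#include <vector>

// Sketch of "shard by inner dimension": C(m,n) = sum_k A(m,k) * B(k,n),
// with each worker computing a partial GEMM over its own [k_start, k_end)
// range without applying the output kernel, as in
// evalGemmPartialWithoutOutputKernel above. All matrices are column-major.
int main() {
  const int m = 4, n = 4, k = 37, num_workers = 3;
  std::vector<double> A(m * k, 1.0), B(k * n, 1.0), C(m * n, 0.0);

  // Same rounding as the thread-pool code: each k-block is a multiple of 8.
  auto divup = [](int a, int b) { return (a + b - 1) / b; };

  std::vector<std::vector<double>> partials(
      num_workers, std::vector<double>(m * n, 0.0));
  std::vector<std::thread> workers;
  int start = 0;
  for (int w = 0, blocks_left = num_workers; w < num_workers;
       ++w, --blocks_left) {
    const int block = 8 * divup(k - start, 8 * blocks_left);
    const int k_start = start;
    const int k_end = std::min(start + block, k);  // last shard may be short
    start = k_end;
    workers.emplace_back([&, w, k_start, k_end] {
      // The loop bound is this shard's k_end, not the full k -- the same
      // clamping the patch applies to actual_kc in evalGemmPartial.
      for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
          for (int kk = k_start; kk < k_end; ++kk)
            partials[w][i + j * m] += A[i + kk * m] * B[kk + j * k];
    });
  }
  for (auto& t : workers) t.join();

  // Accumulate the partial buffers into the final result...
  for (const auto& p : partials)
    for (int i = 0; i < m * n; ++i) C[i] += p[i];

  // ...and apply the "output kernel" exactly once, on the finalized buffer,
  // so it never observes partial sums.
  for (int i = 0; i < m * n; ++i) C[i] *= 0.5;  // stand-in scaling kernel

  std::printf("C[0] = %g (expected %g)\n", C[0], 0.5 * k);
  return 0;
}

This mirrors the two fixes in the diff: each partial evaluation clamps its panels to its own k_end rather than the full k, and the output kernel fires once over the finalized buffer instead of once per partial block, so it never sees partially accumulated values.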