From 6293ad3f392a7b97ebb9f9f874682505c1391f2d Mon Sep 17 00:00:00 2001 From: Vamsi Sripathi Date: Wed, 23 May 2018 14:02:05 -0700 Subject: Performance improvements to tensor broadcast operation 1. Added new packet functions using SIMD for NByOne, OneByN cases 2. Modified existing packet functions to reduce index calculations when input stride is non-SIMD 3. Added 4 test cases to cover the new packet functions --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 110 ++++++++++++++++++++- 1 file changed, 106 insertions(+), 4 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index b6c93aff9..9ab6b3565 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -105,6 +105,7 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; + bool nByOne = false, oneByN = false; enum { IsAligned = true, @@ -142,6 +143,24 @@ struct TensorEvaluator, Device> m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; } } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims-1] == 1) { + nByOne = true; + for (int i = 0; i < NumDims-1; ++i) { + if (broadcast[i] != 1) { + nByOne = false; + break; + } + } + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -237,9 +256,84 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); + if (oneByN) { + return packetNByOne(index); + } else if (nByOne) { + return packetOneByN(index); + } else { + return packetColMajor(index); + } } else { - return packetRowMajor(index); + if (oneByN) { + return packetOneByN(index); + } else if (nByOne) { + return packetNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index dim, inputIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = NumDims - 1; + } else { + dim = 0; + } + + inputIndex = index % m_inputStrides[dim]; + if (inputIndex + PacketSize <= m_inputStrides[dim]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > m_inputStrides[dim]-1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index dim, inputIndex, outputOffset; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = 1; + } else { + dim = NumDims - 2; + } + + inputIndex = index / m_outputStrides[dim]; + outputOffset = index % m_outputStrides[dim]; + if (outputOffset + PacketSize <= m_outputStrides[dim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[dim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); } } @@ -290,7 +384,11 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffColMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffColMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; @@ -342,7 +440,11 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffRowMajor(originalIndex+i); + if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffRowMajor(originalIndex+i); + } } PacketReturnType rslt = internal::pload(values); return rslt; -- cgit v1.2.3