aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
diff options
context:
space:
mode:
authorGravatar Vamsi Sripathi <vamsi.sripathi@intel.com>2018-05-23 14:02:05 -0700
committerGravatar Vamsi Sripathi <vamsi.sripathi@intel.com>2018-05-23 14:02:05 -0700
commit6293ad3f392a7b97ebb9f9f874682505c1391f2d (patch)
tree27e8306ff9ee047f8fce7bcd853fca06294b9783 /unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
parent7134fa7a2eea469f35ea12899e693a633b5b42e5 (diff)
Performance improvements to tensor broadcast operation
1. Added new packet functions using SIMD for NByOne, OneByN cases 2. Modified existing packet functions to reduce index calculations when input stride is non-SIMD 3. Added 4 test cases to cover the new packet functions
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h110
1 files changed, 106 insertions, 4 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index b6c93aff9..9ab6b3565 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -105,6 +105,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ bool nByOne = false, oneByN = false;
enum {
IsAligned = true,
@@ -142,6 +143,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
}
}
+
+ if (input_dims[0] == 1) {
+ oneByN = true;
+ for (int i = 1; i < NumDims; ++i) {
+ if (broadcast[i] != 1) {
+ oneByN = false;
+ break;
+ }
+ }
+ } else if (input_dims[NumDims-1] == 1) {
+ nByOne = true;
+ for (int i = 0; i < NumDims-1; ++i) {
+ if (broadcast[i] != 1) {
+ nByOne = false;
+ break;
+ }
+ }
+ }
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -237,9 +256,84 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- return packetColMajor<LoadMode>(index);
+ if (oneByN) {
+ return packetNByOne<LoadMode>(index);
+ } else if (nByOne) {
+ return packetOneByN<LoadMode>(index);
+ } else {
+ return packetColMajor<LoadMode>(index);
+ }
} else {
- return packetRowMajor<LoadMode>(index);
+ if (oneByN) {
+ return packetOneByN<LoadMode>(index);
+ } else if (nByOne) {
+ return packetNByOne<LoadMode>(index);
+ } else {
+ return packetRowMajor<LoadMode>(index);
+ }
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ Index dim, inputIndex;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ dim = NumDims - 1;
+ } else {
+ dim = 0;
+ }
+
+ inputIndex = index % m_inputStrides[dim];
+ if (inputIndex + PacketSize <= m_inputStrides[dim]) {
+ return m_impl.template packet<Unaligned>(inputIndex);
+ } else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ if (inputIndex > m_inputStrides[dim]-1) {
+ inputIndex = 0;
+ }
+ values[i] = m_impl.coeff(inputIndex++);
+ }
+ return internal::pload<PacketReturnType>(values);
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ Index dim, inputIndex, outputOffset;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ dim = 1;
+ } else {
+ dim = NumDims - 2;
+ }
+
+ inputIndex = index / m_outputStrides[dim];
+ outputOffset = index % m_outputStrides[dim];
+ if (outputOffset + PacketSize <= m_outputStrides[dim]) {
+ values[0] = m_impl.coeff(inputIndex);
+ return internal::pload1<PacketReturnType>(values);
+ } else {
+ for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+ if (outputOffset + cur < m_outputStrides[dim]) {
+ values[i] = m_impl.coeff(inputIndex);
+ } else {
+ values[i] = m_impl.coeff(++inputIndex);
+ outputOffset = 0;
+ cur = 0;
+ }
+ }
+ return internal::pload<PacketReturnType>(values);
}
}
@@ -290,7 +384,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
for (int i = 1; i < PacketSize; ++i) {
- values[i] = coeffColMajor(originalIndex+i);
+ if (innermostLoc + i < m_impl.dimensions()[0]) {
+ values[i] = m_impl.coeff(inputIndex+i);
+ } else {
+ values[i] = coeffColMajor(originalIndex+i);
+ }
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
@@ -342,7 +440,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndex);
for (int i = 1; i < PacketSize; ++i) {
- values[i] = coeffRowMajor(originalIndex+i);
+ if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
+ values[i] = m_impl.coeff(inputIndex+i);
+ } else {
+ values[i] = coeffRowMajor(originalIndex+i);
+ }
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;