diff options
author | Gael Guennebaud <g.gael@free.fr> | 2018-07-31 10:10:14 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2018-07-31 10:10:14 +0200 |
commit | 679eece8760ce9b9ff09e48b6ee8673afcf94caa (patch) | |
tree | 8297adb8202fcdd1bce940937aae110aab8cfaed /unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | |
parent | 723856dec1b5d5be0e35be0612e188a30bfa594b (diff) |
Speedup trivial tensor broadcasting on GPU by enforcing unaligned loads. See PR 437.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 |
1 file changed, 11 insertions(+), 0 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 278689915..e647b3609 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -284,7 +284,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+        // unaligned loads here. The reason is unclear though.
+        return m_impl.template packet<Unaligned>(index);
+        #else
         return m_impl.template packet<LoadMode>(index);
+        #endif
       } else if (oneByN && !nByOne) {
         return packetNByOne<LoadMode>(index);
       } else if (!oneByN && nByOne) {
@@ -296,7 +302,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See above.
+        return m_impl.template packet<Unaligned>(index);
+        #else
         return m_impl.template packet<LoadMode>(index);
+        #endif
       } else if (oneByN && !nByOne) {
         return packetOneByN<LoadMode>(index);
       } else if (!oneByN && nByOne) {