about summary refs log tree commit diff homepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
diff options
context:
space:
mode:
authorGravatar Gael Guennebaud <g.gael@free.fr>2018-07-31 10:10:14 +0200
committerGravatar Gael Guennebaud <g.gael@free.fr>2018-07-31 10:10:14 +0200
commit679eece8760ce9b9ff09e48b6ee8673afcf94caa (patch)
tree8297adb8202fcdd1bce940937aae110aab8cfaed /unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
parent723856dec1b5d5be0e35be0612e188a30bfa594b (diff)
Speedup trivial tensor broadcasting on GPU by enforcing unaligned loads. See PR 437.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11
1 file changed, 11 insertions, 0 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 278689915..e647b3609 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -284,7 +284,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+ // unaligned loads here. The reason is unclear though.
+ return m_impl.template packet<Unaligned>(index);
+ #else
return m_impl.template packet<LoadMode>(index);
+ #endif
} else if (oneByN && !nByOne) {
return packetNByOne<LoadMode>(index);
} else if (!oneByN && nByOne) {
@@ -296,7 +302,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
} else {
if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See above.
+ return m_impl.template packet<Unaligned>(index);
+ #else
return m_impl.template packet<LoadMode>(index);
+ #endif
} else if (oneByN && !nByOne) {
return packetOneByN<LoadMode>(index);
} else if (!oneByN && nByOne) {