author     Eugene Zhulenev <ezhulenev@google.com>  2018-08-01 11:59:04 -0700
committer  Eugene Zhulenev <ezhulenev@google.com>  2018-08-01 11:59:04 -0700
commit     385b3ff12f1dd41a096908a0103873a768a8597d (patch)
tree       3d465cbc2c37c43adbe1b01bba35bb47eb92f48b /unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
parent     83c0a16baf5ecac6288cd9b74536a82de8985b31 (diff)
parent     17221115c9f7e382c84c5d053f885470e904f4a4 (diff)
Merged latest changes from upstream/eigen
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')
 unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index a851e7f55..b6dbe5a22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -104,7 +104,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
bool isCopy= false, nByOne = false, oneByN = false;
enum {
@@ -306,7 +306,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+ // unaligned loads here. The reason is unclear though.
+ return m_impl.template packet<Unaligned>(index);
+ #else
return m_impl.template packet<LoadMode>(index);
+ #endif
} else if (oneByN && !nByOne) {
return packetNByOne<LoadMode>(index);
} else if (!oneByN && nByOne) {
@@ -318,7 +324,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
} else {
if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See above.
+ return m_impl.template packet<Unaligned>(index);
+ #else
return m_impl.template packet<LoadMode>(index);
+ #endif
} else if (oneByN && !nByOne) {
return packetOneByN<LoadMode>(index);
} else if (!oneByN && nByOne) {
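
For context, a minimal standalone sketch (not part of the commit) of the broadcast shapes that reach the branches patched above. The tensor sizes and variable names are illustrative assumptions, and which fast path fires also depends on layout:

// Standalone sketch; assumes Eigen's unsupported Tensor module is on the
// include path. Not part of this commit.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Identity broadcast: every factor is 1, so the op is a plain copy.
  // This is the isCopy branch whose packet() load the patch forces to
  // Unaligned when compiling for GPU (EIGEN_GPU_COMPILE_PHASE).
  Eigen::Tensor<float, 2> t(2, 3);
  t.setRandom();
  Eigen::array<Eigen::Index, 2> identity{{1, 1}};
  Eigen::Tensor<float, 2> copied = t.broadcast(identity);

  // Broadcasting a size-1 dimension: the kind of shape the oneByN /
  // nByOne fast paths target; these still use the caller's LoadMode.
  Eigen::Tensor<float, 2> row(1, 3);
  row.setRandom();
  Eigen::array<Eigen::Index, 2> factors{{4, 1}};
  Eigen::Tensor<float, 2> tiled = row.broadcast(factors);  // 4 x 3

  return (copied.size() == t.size() && tiled.size() == 12) ? 0 : 1;
}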