diff options
author | Eugene Zhulenev <ezhulenev@google.com> | 2018-08-01 11:59:04 -0700 |
---|---|---|
committer | Eugene Zhulenev <ezhulenev@google.com> | 2018-08-01 11:59:04 -0700 |
commit | 385b3ff12f1dd41a096908a0103873a768a8597d (patch) | |
tree | 3d465cbc2c37c43adbe1b01bba35bb47eb92f48b /unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | |
parent | 83c0a16baf5ecac6288cd9b74536a82de8985b31 (diff) | |
parent | 17221115c9f7e382c84c5d053f885470e904f4a4 (diff) |
Merged latest changes from upstream/eigen
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 13 |
1 file changed, 12 insertions, 1 deletion
```diff
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index a851e7f55..b6dbe5a22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -104,7 +104,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   bool isCopy= false, nByOne = false, oneByN = false;
 
   enum {
@@ -306,7 +306,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+        // unaligned loads here. The reason is unclear though.
+        return m_impl.template packet<Unaligned>(index);
+        #else
         return m_impl.template packet<LoadMode>(index);
+        #endif
       } else if (oneByN && !nByOne) {
         return packetNByOne<LoadMode>(index);
       } else if (!oneByN && nByOne) {
@@ -318,7 +324,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       }
     } else {
       if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See above.
+        return m_impl.template packet<Unaligned>(index);
+        #else
         return m_impl.template packet<LoadMode>(index);
+        #endif
       } else if (oneByN && !nByOne) {
         return packetOneByN<LoadMode>(index);
       } else if (!oneByN && nByOne) {
```