author     Eugene Zhulenev <ezhulenev@google.com>  2018-08-01 11:59:04 -0700
committer  Eugene Zhulenev <ezhulenev@google.com>  2018-08-01 11:59:04 -0700
commit     385b3ff12f1dd41a096908a0103873a768a8597d (patch)
tree       3d465cbc2c37c43adbe1b01bba35bb47eb92f48b /unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
parent     83c0a16baf5ecac6288cd9b74536a82de8985b31 (diff)
parent     17221115c9f7e382c84c5d053f885470e904f4a4 (diff)
Merged latest changes from upstream/eigen
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h')
 unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index a851e7f55..b6dbe5a22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -104,7 +104,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
bool isCopy= false, nByOne = false, oneByN = false;
enum {
@@ -306,7 +306,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+ // unaligned loads here. The reason is unclear though.
+ return m_impl.template packet<Unaligned>(index);
+ #else
return m_impl.template packet<LoadMode>(index);
+ #endif
} else if (oneByN && !nByOne) {
return packetNByOne<LoadMode>(index);
} else if (!oneByN && nByOne) {
@@ -318,7 +324,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
}
} else {
if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See above.
+ return m_impl.template packet<Unaligned>(index);
+ #else
return m_impl.template packet<LoadMode>(index);
+ #endif
} else if (oneByN && !nByOne) {
return packetOneByN<LoadMode>(index);
} else if (!oneByN && nByOne) {
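
For context, a minimal standalone sketch (not part of the commit) of the broadcast shapes that reach the branches patched above. The tensor sizes and variable names are illustrative assumptions, and which fast path fires also depends on layout:

// Standalone sketch; assumes Eigen's unsupported Tensor module is on the
// include path. Not part of this commit.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Identity broadcast: every factor is 1, so the op is a plain copy.
  // This is the isCopy branch whose packet() load the patch forces to
  // Unaligned when compiling for GPU (EIGEN_GPU_COMPILE_PHASE).
  Eigen::Tensor<float, 2> t(2, 3);
  t.setRandom();
  Eigen::array<Eigen::Index, 2> identity{{1, 1}};
  Eigen::Tensor<float, 2> copied = t.broadcast(identity);

  // Broadcasting a size-1 dimension: the kind of shape the oneByN /
  // nByOne fast paths target; these still use the caller's LoadMode.
  Eigen::Tensor<float, 2> row(1, 3);
  row.setRandom();
  Eigen::array<Eigen::Index, 2> factors{{4, 1}};
  Eigen::Tensor<float, 2> tiled = row.broadcast(factors);  // 4 x 3

  return (copied.size() == t.size() && tiled.size() == 12) ? 0 : 1;
}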