From e95696acb313a84b33a18cc300de418b05dc58e5 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Thu, 27 Sep 2018 14:49:26 -0700
Subject: Optimize TensorBlockCopyOp

---
 unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h | 86 +++++++++++++++++++++---
 1 file changed, 75 insertions(+), 11 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 558130300..35523ec73 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -144,24 +144,88 @@ class TensorBlock {
 
 template <typename StorageIndex, typename Scalar>
 struct TensorBlockCopyOp {
+
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum {
+    Vectorizable = internal::packet_traits<Scalar>::Vectorizable,
+    PacketSize = internal::packet_traits<Scalar>::size
+  };
+
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
       const StorageIndex num_coeff_to_copy, const StorageIndex dst_index,
       const StorageIndex dst_stride, Scalar* EIGEN_RESTRICT dst_data,
       const StorageIndex src_index, const StorageIndex src_stride,
       const Scalar* EIGEN_RESTRICT src_data) {
-    const Scalar* src_base = &src_data[src_index];
-    Scalar* dst_base = &dst_data[dst_index];
-
-    typedef const Array<Scalar, Dynamic, 1> Src;
-    typedef Array<Scalar, Dynamic, 1> Dst;
+    const Scalar* src = &src_data[src_index];
+    Scalar* dst = &dst_data[dst_index];
 
-    typedef Map<Src, 0, InnerStride<> > SrcMap;
-    typedef Map<Dst, 0, InnerStride<> > DstMap;
-
-    const SrcMap src(src_base, num_coeff_to_copy, InnerStride<>(src_stride));
-    DstMap dst(dst_base, num_coeff_to_copy, InnerStride<>(dst_stride));
+    if (!Vectorizable) {
+      for (Index i = 0; i < num_coeff_to_copy; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+      return;
+    }
 
-    dst = src;
+    if (src_stride == 1) {
+      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      if (dst_stride == 1) {
+        // LINEAR
+        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+          Packet p = internal::ploadu<Packet>(src + i);
+          internal::pstoreu<Scalar, Packet>(dst + i, p);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst[i] = src[i];
+        }
+      } else {
+        // SCATTER
+        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+          Packet p = internal::ploadu<Packet>(src + i);
+          internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst[i * dst_stride] = src[i];
+        }
+      }
+    } else if (src_stride == 0) {
+      const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+      if (dst_stride == 1) {
+        // LINEAR
+        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+          Packet p = internal::pload1<Packet>(src);
+          internal::pstoreu<Scalar, Packet>(dst + i, p);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst[i] = *src;
+        }
+      } else {
+        // SCATTER
+        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+          Packet p = internal::pload1<Packet>(src);
+          internal::pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst[i * dst_stride] = *src;
+        }
+      }
+    } else {
+      if (dst_stride == 1) {
+        // GATHER
+        const Index vectorized_size = (num_coeff_to_copy / PacketSize) * PacketSize;
+        for (Index i = 0; i < vectorized_size; i += PacketSize) {
+          Packet p = internal::pgather<Scalar, Packet>(src + i * src_stride, src_stride);
+          internal::pstoreu<Scalar, Packet>(dst + i, p);
+        }
+        for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+          dst[i] = src[i * src_stride];
+        }
+      } else {
+        // RANDOM
+        for (Index i = 0; i < num_coeff_to_copy; ++i) {
+          dst[i * dst_stride] = src[i * src_stride];
+        }
+      }
+    }
   }
 };
 
-- 
cgit v1.2.3