From d9084ac8e142697f0d767092a17ffc3a7a18a2e4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 14 Apr 2017 11:05:13 +0200 Subject: Improve mixing of complex and real in the vectorized path of apply_rotation_in_the_plane --- Eigen/src/Jacobi/Jacobi.h | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'Eigen/src/Jacobi') diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index d25af8e90..c30326e1d 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -302,8 +302,12 @@ template void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - enum { PacketSize = packet_traits::size }; + enum { + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; typedef typename packet_traits::type Packet; + typedef typename packet_traits::type OtherPacket; eigen_assert(xpr_x.size() == xpr_y.size()); Index size = xpr_x.size(); Index incrx = xpr_x.derived().innerStride(); @@ -321,6 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x if(VectorX::SizeAtCompileTime == Dynamic && (VectorX::Flags & VectorY::Flags & PacketAccessBit) && + (PacketSize == OtherPacketSize) && ((incrx==1 && incry==1) || PacketSize == 1)) { // both vectors are sequentially stored in memory => vectorization @@ -329,9 +334,10 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Index alignedStart = internal::first_default_aligned(y, size); Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; for(Index i=0; i& xpr_x { Packet xi = pload(px); Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); px += PacketSize; py += PacketSize; } @@ -365,10 +371,10 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Packet xi1 = ploadu(px+PacketSize); Packet yi = pload (py); Packet yi1 = pload (py+PacketSize); - pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1))); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); px += Peeling*PacketSize; py += Peeling*PacketSize; } @@ -376,8 +382,8 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x { Packet xi = ploadu(x+peelingEnd); Packet yi = pload (y+peelingEnd); - pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi))); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); } } @@ -393,19 +399,21 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x /*** fixed-size vectorized path ***/ else if(VectorX::SizeAtCompileTime != Dynamic && (VectorX::Flags & VectorY::Flags & PacketAccessBit) && + (PacketSize == OtherPacketSize) && (EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment)>0)) // FIXME should be compared to the required alignment { - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; Scalar* EIGEN_RESTRICT px = x; Scalar* EIGEN_RESTRICT py = y; for(Index i=0; i(px); Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); px += PacketSize; py += PacketSize; } -- cgit v1.2.3