path: root/Eigen/src/Core/Assign.h
author    Gael Guennebaud <g.gael@free.fr>    2008-08-09 18:41:24 +0000
committer Gael Guennebaud <g.gael@free.fr>    2008-08-09 18:41:24 +0000
commit 4fa40367e9bf55ea8b2ad1040b3fb73f94e4f12f (patch)
tree   3ca6d7cff691daf2d6bc8d6b1ecb00971f9debf3 /Eigen/src/Core/Assign.h
parent becbeda50ac17288dba0a93c6adc67b663d32a7a (diff)
* Big change in Block and Map:
  - added a MapBase base expression on top of which Map and the specialization of Block are implemented
  - MapBase forces both aligned loads and aligned stores (see below) in expressions such as "x.block(...) += other_expr"
* Significant vectorization improvement:
  - added an AlignedBit flag meaning the first coeff/packet is aligned; this makes it possible to skip the extra code dealing with the first unaligned part
  - removed all unaligned stores when not unrolling
  - removed unaligned loads in Sum when the input has the DirectAccessBit flag
* Some code simplification in the CacheFriendly product
* Some minor documentation improvements
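A minimal standalone sketch of the alignment-peeling idea behind this change (not Eigen code: assign, alignmentOffset and PacketSize are hypothetical stand-ins for the assignment loop, ei_alignmentOffset and ei_packet_traits<float>::size, assuming SSE floats and 16-byte packets). The destination's unaligned head is copied coefficient by coefficient, the aligned middle packet by packet with aligned stores, and the tail coefficient by coefficient, which are the three loops the LinearVectorization path gains in the diff below:

#include <cstdint>
#include <xmmintrin.h>

static const int PacketSize = 4; // floats per 16-byte SSE packet

// first index i in [0, maxOffset] such that data + i is 16-byte aligned
static int alignmentOffset(const float* data, int maxOffset)
{
  const std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(data);
  const int offset = int(((16 - addr % 16) % 16) / sizeof(float));
  return offset < maxOffset ? offset : maxOffset;
}

void assign(float* dst, const float* src, int size)
{
  const int alignedStart = alignmentOffset(dst, size);
  const int alignedEnd   = alignedStart + ((size - alignedStart) / PacketSize) * PacketSize;
  for (int i = 0; i < alignedStart; ++i)            // peeled unaligned head, scalar copies
    dst[i] = src[i];
  for (int i = alignedStart; i < alignedEnd; i += PacketSize)
    _mm_store_ps(dst + i, _mm_loadu_ps(src + i));   // store always aligned; load conservatively unaligned
  for (int i = alignedEnd; i < size; ++i)           // leftover tail, scalar copies
    dst[i] = src[i];
}

When both operands are statically known to be aligned, the load can become an aligned one and the head loop compiles away; this is why the new SrcAlignment enum below is keyed on DstIsAligned && SrcIsAligned rather than on the source flag alone.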
Diffstat (limited to 'Eigen/src/Core/Assign.h')
-rw-r--r-- Eigen/src/Core/Assign.h | 59
1 file changed, 44 insertions, 15 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index d744a15a4..5ea59e3db 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -34,6 +34,13 @@
template <typename Derived, typename OtherDerived>
struct ei_assign_traits
{
+public:
+ enum {
+ DstIsAligned = Derived::Flags & AlignedBit,
+ SrcIsAligned = OtherDerived::Flags & AlignedBit,
+ SrcAlignment = DstIsAligned && SrcIsAligned ? Aligned : Unaligned
+ };
+
private:
enum {
InnerSize = int(Derived::Flags)&RowMajorBit
@@ -48,7 +55,8 @@ private:
enum {
MightVectorize = (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit)
&& ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
- MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0,
+ MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+ && int(DstIsAligned) && int(SrcIsAligned),
MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
MaySliceVectorize = MightVectorize && int(InnerMaxSize)==Dynamic /* slice vectorization can be slow, so we only
want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
@@ -79,7 +87,7 @@ public:
: int(NoUnrolling)
)
: int(Vectorization) == int(LinearVectorization)
- ? ( int(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
+ ? ( int(MayUnrollCompletely) && int(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
: int(NoUnrolling)
};
};
@@ -154,7 +162,7 @@ struct ei_assign_innervec_CompleteUnrolling
inline static void run(Derived1 &dst, const Derived2 &src)
{
- dst.template copyPacket<Derived2, Aligned>(row, col, src);
+ dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src);
ei_assign_innervec_CompleteUnrolling<Derived1, Derived2,
Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
}
@@ -173,7 +181,7 @@ struct ei_assign_innervec_InnerUnrolling
{
const int row = int(Derived1::Flags)&RowMajorBit ? row_or_col : Index;
const int col = int(Derived1::Flags)&RowMajorBit ? Index : row_or_col;
- dst.template copyPacket<Derived2, Aligned>(row, col, src);
+ dst.template copyPacket<Derived2, Aligned, Aligned>(row, col, src);
ei_assign_innervec_InnerUnrolling<Derived1, Derived2,
Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, row_or_col);
}
@@ -256,9 +264,9 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
for(int i = 0; i < innerSize; i+=packetSize)
{
if(int(Derived1::Flags)&RowMajorBit)
- dst.template copyPacket<Derived2, Aligned>(j, i, src);
+ dst.template copyPacket<Derived2, Aligned, Aligned>(j, i, src);
else
- dst.template copyPacket<Derived2, Aligned>(i, j, src);
+ dst.template copyPacket<Derived2, Aligned, Aligned>(i, j, src);
}
}
};
@@ -298,14 +306,19 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
{
const int size = dst.size();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
- const int alignedSize = (size/packetSize)*packetSize;
+ const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+ : ei_alignmentOffset(&dst.coeffRef(0), size);
+ const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
- for(int index = 0; index < alignedSize; index += packetSize)
+ for(int index = 0; index < alignedStart; index++)
+ dst.copyCoeff(index, src);
+
+ for(int index = alignedStart; index < alignedEnd; index += packetSize)
{
- dst.template copyPacket<Derived2, Aligned>(index, src);
+ dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::SrcAlignment>(index, src);
}
- for(int index = alignedSize; index < size; index++)
+ for(int index = alignedEnd; index < size; index++)
dst.copyCoeff(index, src);
}
};
@@ -334,29 +347,45 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
static void run(Derived1 &dst, const Derived2 &src)
{
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
+ const int packetAlignedMask = packetSize - 1;
const int innerSize = dst.innerSize();
const int outerSize = dst.outerSize();
- const int alignedInnerSize = (innerSize/packetSize)*packetSize;
+ const int alignedStep = (packetSize - dst.stride() % packetSize) & packetAlignedMask;
+ int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
+ : ei_alignmentOffset(&dst.coeffRef(0), innerSize);
for(int i = 0; i < outerSize; i++)
{
+ const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+
+ // do the non-vectorizable part of the assignment
+ for (int index = 0; index<alignedStart ; index++)
+ {
+ if(Derived1::Flags&RowMajorBit)
+ dst.copyCoeff(i, index, src);
+ else
+ dst.copyCoeff(index, i, src);
+ }
+
// do the vectorizable part of the assignment
- for (int index = 0; index<alignedInnerSize ; index+=packetSize)
+ for (int index = alignedStart; index<alignedEnd; index+=packetSize)
{
if(Derived1::Flags&RowMajorBit)
- dst.template copyPacket<Derived2, Unaligned>(i, index, src);
+ dst.template copyPacket<Derived2, Aligned, Unaligned>(i, index, src);
else
- dst.template copyPacket<Derived2, Unaligned>(index, i, src);
+ dst.template copyPacket<Derived2, Aligned, Unaligned>(index, i, src);
}
// do the non-vectorizable part of the assignment
- for (int index = alignedInnerSize; index<innerSize ; index++)
+ for (int index = alignedEnd; index<innerSize ; index++)
{
if(Derived1::Flags&RowMajorBit)
dst.copyCoeff(i, index, src);
else
dst.copyCoeff(index, i, src);
}
+
+ alignedStart = (alignedStart+alignedStep)%packetSize;
}
}
};
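One subtlety of the slice-vectorized path above: the per-column aligned start is not recomputed from the address on each outer iteration. Column j starts at base + j*stride scalars, so its alignment offset shifts by stride modulo packetSize per column, which the loop tracks with alignedStart = (alignedStart + alignedStep) % packetSize. A toy self-check of that recurrence, under assumed values (packets of 4 floats, a 16-byte-aligned base, and an arbitrary inner stride of 7):

#include <cassert>

int main()
{
  const int packetSize = 4;
  const int packetAlignedMask = packetSize - 1;
  const int stride = 7; // inner stride in scalars, e.g. a column-major 7xN matrix
  const int alignedStep = (packetSize - stride % packetSize) & packetAlignedMask;

  int alignedStart = 0; // column 0 starts on an aligned address by assumption
  for (int col = 0; col < 8; ++col)
  {
    const int colBase  = col * stride; // scalar offset of the column's first element
    const int expected = (packetSize - colBase % packetSize) & packetAlignedMask;
    assert(alignedStart == expected);  // recurrence matches the address-derived offset
    alignedStart = (alignedStart + alignedStep) % packetSize;
  }
  return 0;
}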