diff options
-rw-r--r-- | Eigen/src/Core/MatrixStorage.h | 44 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 6 | ||||
-rw-r--r-- | Eigen/src/Core/util/Macros.h | 16 | ||||
-rw-r--r-- | test/packetmath.cpp | 14 | ||||
-rw-r--r-- | test/unalignedassert.cpp | 96 |
6 files changed, 105 insertions, 77 deletions
diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index f67095d0c..654fdf5e6 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -29,29 +29,45 @@ struct ei_constructor_without_unaligned_array_assert {}; /** \internal - * Static array automatically aligned if the total byte size is a multiple of 16 and the matrix options require auto alignment + * Static array. If the MatrixOptions require auto-alignment, and the array will be automatically aligned: + * - to 16 bytes boundary, if the total size is a multiple of 16 bytes; + * - or else to 8 bytes boundary, if the total size is a multiple of 8 bytes. */ template <typename T, int Size, int MatrixOptions, - bool Align = (!(MatrixOptions&DontAlign)) && (((Size*sizeof(T))&0xf)==0) -> struct ei_matrix_array + int Alignment = (MatrixOptions&DontAlign) ? 0 + : (((Size*sizeof(T))%16)==0) ? 16 + : (((Size*sizeof(T))%8)==0) ? 8 + : 0 > +struct ei_matrix_array { - EIGEN_ALIGN_128 T array[Size]; + T array[Size]; + ei_matrix_array() {} + ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} +}; - ei_matrix_array() - { - #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT - ei_assert((reinterpret_cast<size_t>(array) & 0xf) == 0 - && "this assertion is explained here: http://eigen.tuxfamily.org/dox/UnalignedArrayAssert.html **** READ THIS WEB PAGE !!! ****"); - #endif - } +#ifdef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT + #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) +#else + #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ + ei_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \ + && "this assertion is explained here: " \ + "http://eigen.tuxfamily.org/dox/UnalignedArrayAssert.html" \ + " **** READ THIS WEB PAGE !!! ****"); +#endif +template <typename T, int Size, int MatrixOptions> +struct ei_matrix_array<T, Size, MatrixOptions, 16> +{ + EIGEN_ALIGN16 T array[Size]; + ei_matrix_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf) } ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} }; -template <typename T, int Size, int MatrixOptions> struct ei_matrix_array<T,Size,MatrixOptions,false> +template <typename T, int Size, int MatrixOptions> +struct ei_matrix_array<T, Size, MatrixOptions, 8> { - T array[Size]; - ei_matrix_array() {} + EIGEN_ALIGN8 T array[Size]; + ei_matrix_array() { EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0x7) } ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} }; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index a9c16200e..1526a4b97 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -265,14 +265,14 @@ template<> inline void ei_pstoreu(int* to , const v4i& from ) template<> inline float ei_pfirst(const v4f& a) { - float EIGEN_ALIGN_128 af[4]; + float EIGEN_ALIGN16 af[4]; vec_st(a, 0, af); return af[0]; } template<> inline int ei_pfirst(const v4i& a) { - int EIGEN_ALIGN_128 ai[4]; + int EIGEN_ALIGN16 ai[4]; vec_st(a, 0, ai); return ai[0]; } @@ -373,7 +373,7 @@ inline float ei_predux_mul(const v4f& a) inline int ei_predux_mul(const v4i& a) { - EIGEN_ALIGN_128 int aux[4]; + EIGEN_ALIGN16 int aux[4]; ei_pstore(aux, a); return aux[0] * aux[1] * aux[2] * aux[3]; } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ddc7b4aaf..eb1c2d311 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -359,7 +359,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a) // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., reusing ei_pmul is very slow !) // TODO try to call _mm_mul_epu32 directly - EIGEN_ALIGN_128 int aux[4]; + EIGEN_ALIGN16 int aux[4]; ei_pstore(aux, a); return (aux[0] * aux[1]) * (aux[2] * aux[3]);; } @@ -378,7 +378,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a) { // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the ei_pstore !!) - EIGEN_ALIGN_128 int aux[4]; + EIGEN_ALIGN16 int aux[4]; ei_pstore(aux, a); register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1]; register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3]; @@ -399,7 +399,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a) { // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the ei_pstore !!) - EIGEN_ALIGN_128 int aux[4]; + EIGEN_ALIGN16 int aux[4]; ei_pstore(aux, a); register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1]; register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3]; diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 71962bcae..fb149e50a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -202,25 +202,29 @@ using Eigen::ei_cos; #define EIGEN_ASM_COMMENT(X) #endif -/* EIGEN_ALIGN_128 forces data to be 16-byte aligned, EVEN if vectorization (EIGEN_VECTORIZE) is disabled, +/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. + * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, * so that vectorization doesn't affect binary compatibility. * * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ #if !EIGEN_ALIGN - #define EIGEN_ALIGN_128 + #define EIGEN_ALIGN_TO_BOUNDARY(n) #elif (defined __GNUC__) - #define EIGEN_ALIGN_128 __attribute__((aligned(16))) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif (defined _MSC_VER) - #define EIGEN_ALIGN_128 __declspec(align(16)) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) #elif (defined __SUNPRO_CC) // FIXME not sure about this one: - #define EIGEN_ALIGN_128 __attribute__((aligned(16))) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #else - #error Please tell me what is the equivalent of __attribute__((aligned(16))) for your compiler + #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler #endif +#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) + #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD #define EIGEN_RESTRICT #endif diff --git a/test/packetmath.cpp b/test/packetmath.cpp index d86d40d68..1745ae5c6 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -99,10 +99,10 @@ template<typename Scalar> void packetmath() const int PacketSize = ei_packet_traits<Scalar>::size; const int size = PacketSize*4; - EIGEN_ALIGN_128 Scalar data1[ei_packet_traits<Scalar>::size*4]; - EIGEN_ALIGN_128 Scalar data2[ei_packet_traits<Scalar>::size*4]; - EIGEN_ALIGN_128 Packet packets[PacketSize*2]; - EIGEN_ALIGN_128 Scalar ref[ei_packet_traits<Scalar>::size*4]; + EIGEN_ALIGN16 Scalar data1[ei_packet_traits<Scalar>::size*4]; + EIGEN_ALIGN16 Scalar data2[ei_packet_traits<Scalar>::size*4]; + EIGEN_ALIGN16 Packet packets[PacketSize*2]; + EIGEN_ALIGN16 Scalar ref[ei_packet_traits<Scalar>::size*4]; for (int i=0; i<size; ++i) { data1[i] = ei_random<Scalar>(); @@ -202,9 +202,9 @@ template<typename Scalar> void packetmath_real() const int PacketSize = ei_packet_traits<Scalar>::size; const int size = PacketSize*4; - EIGEN_ALIGN_128 Scalar data1[ei_packet_traits<Scalar>::size*4]; - EIGEN_ALIGN_128 Scalar data2[ei_packet_traits<Scalar>::size*4]; - EIGEN_ALIGN_128 Scalar ref[ei_packet_traits<Scalar>::size*4]; + EIGEN_ALIGN16 Scalar data1[ei_packet_traits<Scalar>::size*4]; + EIGEN_ALIGN16 Scalar data2[ei_packet_traits<Scalar>::size*4]; + EIGEN_ALIGN16 Scalar ref[ei_packet_traits<Scalar>::size*4]; for (int i=0; i<size; ++i) { diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index ade1ab26e..8acc90158 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -24,52 +24,38 @@ #include "main.h" -struct Good1 +struct TestNew1 { MatrixXd m; // good: m will allocate its own array, taking care of alignment. - Good1() : m(20,20) {} + TestNew1() : m(20,20) {} }; -struct Good2 +struct TestNew2 { - Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be aligned + Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned, + // 8-byte alignment is good enough here, which we'll get automatically }; -struct Good3 +struct TestNew3 { - Vector2f m; // good: same reason + Vector2f m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned }; -struct Bad4 -{ - Vector2d m; // bad: sizeof(m)%16==0 so alignment is required -}; - -struct Bad5 -{ - Matrix<float, 2, 6> m; // bad: same reason -}; - -struct Bad6 -{ - Matrix<double, 3, 4> m; // bad: same reason -}; - -struct Good7 +struct TestNew4 { EIGEN_MAKE_ALIGNED_OPERATOR_NEW Vector2d m; float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects }; -struct Good8 +struct TestNew5 { EIGEN_MAKE_ALIGNED_OPERATOR_NEW - float f; // try the f at first -- the EIGEN_ALIGN_128 attribute of m should make that still work + float f; // try the f at first -- the EIGEN_ALIGN16 attribute of m should make that still work Matrix4f m; }; -struct Good9 +struct TestNew6 { Matrix<float,2,2,DontAlign> m; // good: no alignment requested float f; @@ -94,34 +80,56 @@ void check_unalignedassert_good() #if EIGEN_ALIGN template<typename T> -void check_unalignedassert_bad() +void construct_at_boundary(int boundary) { - float buf[sizeof(T)+16]; - float *unaligned = buf; - while((reinterpret_cast<size_t>(unaligned)&0xf)==0) ++unaligned; // make sure unaligned is really unaligned - T *x = ::new(static_cast<void*>(unaligned)) T; + char buf[sizeof(T)+256]; + size_t _buf = reinterpret_cast<size_t>(buf); + _buf += (16 - (_buf % 16)); // make 16-byte aligned + _buf += boundary; // make exact boundary-aligned + T *x = ::new(reinterpret_cast<void*>(_buf)) T; x->~T(); } #endif void unalignedassert() { - check_unalignedassert_good<Good1>(); - check_unalignedassert_good<Good2>(); - check_unalignedassert_good<Good3>(); -#if EIGEN_ALIGN - VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad4>()); - VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad5>()); - VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Bad6>()); -#endif - - check_unalignedassert_good<Good7>(); - check_unalignedassert_good<Good8>(); - check_unalignedassert_good<Good9>(); + construct_at_boundary<Vector2f>(8); + construct_at_boundary<Vector3f>(4); + construct_at_boundary<Vector4f>(16); + construct_at_boundary<Matrix2f>(16); + construct_at_boundary<Matrix3f>(4); + construct_at_boundary<Matrix4f>(16); + + construct_at_boundary<Vector2d>(16); + construct_at_boundary<Vector3d>(8); + construct_at_boundary<Vector4d>(16); + construct_at_boundary<Matrix2d>(16); + construct_at_boundary<Matrix3d>(8); + construct_at_boundary<Matrix4d>(16); + + check_unalignedassert_good<TestNew1>(); + check_unalignedassert_good<TestNew2>(); + check_unalignedassert_good<TestNew3>(); + + check_unalignedassert_good<TestNew4>(); + check_unalignedassert_good<TestNew5>(); + check_unalignedassert_good<TestNew6>(); check_unalignedassert_good<Depends<true> >(); - + #if EIGEN_ALIGN - VERIFY_RAISES_ASSERT(check_unalignedassert_bad<Depends<false> >()); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2f>(4)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(4)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4f>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2f>(4)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2f>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(4)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4f>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector2d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector3d>(4)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Vector4d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix2d>(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix3d>(4)); + VERIFY_RAISES_ASSERT(construct_at_boundary<Matrix4d>(8)); #endif } |