13 files changed, 140 insertions, 43 deletions
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index e3af5f61f..d519b47be 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -386,14 +386,22 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling
     const int size = dst.size();
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
     const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
-                           : ei_alignmentOffset(&dst.coeffRef(0), size);
+                           : ei_first_aligned(&dst.coeffRef(0), size);
     const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
 
+    EIGEN_DEBUG_VAR(&dst.coeffRef(0));
+    EIGEN_DEBUG_VAR(size);
+    EIGEN_DEBUG_VAR(packetSize);
+    EIGEN_DEBUG_VAR(alignedStart);
+    EIGEN_DEBUG_VAR(alignedEnd);
+    
     for(int index = 0; index < alignedStart; ++index)
       dst.copyCoeff(index, src);
 
     for(int index = alignedStart; index < alignedEnd; index += packetSize)
     {
+      EIGEN_DEBUG_VAR(index);
+      EIGEN_DEBUG_VAR(&dst.coeffRef(index));
       dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::SrcAlignment>(index, src);
     }
 
@@ -431,7 +439,7 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling>
     const int outerSize = dst.outerSize();
     const int alignedStep = (packetSize - dst.stride() % packetSize) & packetAlignedMask;
     int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
-                     : ei_alignmentOffset(&dst.coeffRef(0,0), innerSize);
+                     : ei_first_aligned(&dst.coeffRef(0,0), innerSize);
 
     for(int i = 0; i < outerSize; ++i)
     {
diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h
index eda36b7bb..d2b3237c5 100644
--- a/Eigen/src/Core/Coeffs.h
+++ b/Eigen/src/Core/Coeffs.h
@@ -380,35 +380,33 @@ EIGEN_STRONG_INLINE void MatrixBase<Derived>::copyPacket(int index, const Matrix
 }
 
 
-template<typename Derived, typename Integer, bool JustReturnZero>
-struct ei_alignmentOffset_impl
+template<typename Derived, bool JustReturnZero>
+struct ei_first_aligned_impl
 {
-  inline static Integer run(const MatrixBase<Derived>&, Integer)
+  inline static int run(const MatrixBase<Derived>&)
   { return 0; }
 };
 
-template<typename Derived, typename Integer>
-struct ei_alignmentOffset_impl<Derived, Integer, false>
+template<typename Derived>
+struct ei_first_aligned_impl<Derived, false>
 {
-  inline static Integer run(const MatrixBase<Derived>& m, Integer maxOffset)
+  inline static int run(const MatrixBase<Derived>& m)
   {
-    return ei_alignmentOffset(&m.const_cast_derived().coeffRef(0,0), maxOffset);
+    return ei_first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
   }
 };
 
-/** \internal \returns the number of elements which have to be skipped, starting
-  * from the address of coeffRef(0,0), to find the first 16-byte aligned element.
-  *
-  * \note If the expression doesn't have the DirectAccessBit, this function returns 0.
+/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
   *
-  * There is also the variant ei_alignmentOffset(const Scalar*, Integer) defined in Memory.h.
+  * There is also the variant ei_first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
+  * documentation.
   */
-template<typename Derived, typename Integer>
-inline static Integer ei_alignmentOffset(const MatrixBase<Derived>& m, Integer maxOffset)
+template<typename Derived>
+inline static int ei_first_aligned(const MatrixBase<Derived>& m)
 {
-  return ei_alignmentOffset_impl<Derived, Integer,
-                                 (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
-                                 ::run(m, maxOffset);
+  return ei_first_aligned_impl
+           <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
+         ::run(m);
 }
 
 #endif
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index ae6b01f57..399e2cd90 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -209,7 +209,7 @@ struct ei_redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
   {
     const int size = mat.size();
     const int packetSize = ei_packet_traits<Scalar>::size;
-    const int alignedStart = ei_alignmentOffset(mat,size);
+    const int alignedStart = ei_first_aligned(mat);
     enum {
       alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
                 ? Aligned : Unaligned
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index f2d1e7240..fa1e095ca 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -65,7 +65,7 @@ MatrixBase<Derived>::stableNorm() const
   int bi=0;
   if ((int(Flags)&DirectAccessBit) && !(int(Flags)&AlignedBit))
   {
-    bi = ei_alignmentOffset(&const_cast_derived().coeffRef(0), n);
+    bi = ei_first_aligned(&const_cast_derived().coeffRef(0), n);
     if (bi>0)
       ei_stable_norm_kernel(start(bi), ssq, scale, invScale);
   }
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index a18e5ef1d..3296f32ff 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -69,7 +69,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
 
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type.
-  int alignedStart = ei_alignmentOffset(res,size);
+  int alignedStart = ei_first_aligned(res,size);
   int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
   const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
 
@@ -79,7 +79,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
                        : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+  const int lhsAlignmentOffset = ei_first_aligned(lhs,size);
 
   // find how many columns do we have to skip to be aligned with the result (if possible)
   int skipColumns = 0;
@@ -282,7 +282,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type
   // if that's not the case then vectorization is discarded, see below.
-  int alignedStart = ei_alignmentOffset(rhs, size);
+  int alignedStart = ei_first_aligned(rhs, size);
   int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
   const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
 
@@ -292,7 +292,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
                        : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+  const int lhsAlignmentOffset = ei_first_aligned(lhs,size);
 
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   int skipRows = 0;
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index c27454bee..32b7f220e 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -86,7 +86,7 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
     size_t starti = FirstTriangular ? 0 : j+2;
     size_t endi   = FirstTriangular ? j : size;
     size_t alignedEnd = starti;
-    size_t alignedStart = (starti) + ei_alignmentOffset(&res[starti], endi-starti);
+    size_t alignedStart = (starti) + ei_first_aligned(&res[starti], endi-starti);
     alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
 
     res[j] += cj0.pmul(A0[j], t0);
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 524bec2fc..ecc13dab4 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -209,27 +209,53 @@ template<typename T, bool Align> inline void ei_conditional_aligned_delete(T *pt
   ei_conditional_aligned_free<Align>(ptr);
 }
 
-/** \internal \returns the number of elements which have to be skipped to
-  * find the first 16-byte aligned element
+/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
   *
-  * There is also the variant ei_alignmentOffset(const MatrixBase&, Integer) defined in Coeffs.h.
+  * \param array the address of the start of the array
+  * \param size the size of the array
+  *
+  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
+  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * packet size for the given scalar type is 1, then everything is considered well-aligned.
+  *
+  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
+  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
+  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * example with Scalar=double on certain 32-bit platforms, see bug #79.
+  *
+  * There is also the variant ei_first_aligned(const MatrixBase&, Integer) defined in Coeffs.h.
   */
 template<typename Scalar, typename Integer>
-inline static Integer ei_alignmentOffset(const Scalar* ptr, Integer maxOffset)
+inline static Integer ei_first_aligned(const Scalar* array, Integer size)
 {
   typedef typename ei_packet_traits<Scalar>::type Packet;
-  const Integer PacketSize = ei_packet_traits<Scalar>::size;
-  const Integer PacketAlignedMask = PacketSize-1;
-  const bool Vectorized = PacketSize>1;
-  return Vectorized
-          ? std::min<Integer>( (PacketSize - (Integer((size_t(ptr)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, maxOffset)
-          : 0;
+  enum { PacketSize = ei_packet_traits<Scalar>::size,
+         PacketAlignedMask = PacketSize-1
+  };
+  
+  if(PacketSize==1)
+  {
+    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
+    // of the array have the same aligment.
+    return 0;
+  }
+  else if(size_t(array) & (sizeof(Scalar)-1))
+  {
+    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // Consequently, no element of the array is well aligned.
+    return size;
+  }
+  else
+  {
+    return std::min<Integer>( (PacketSize - (Integer((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
+                           & PacketAlignedMask, size);
+  }
 }
 
 /** \internal
   * ei_aligned_stack_alloc(SIZE) allocates an aligned buffer of SIZE bytes
-  * on the stack if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT.
+  * on the stack if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and
+  * if stack allocation is supported by the platform (currently, this is linux only).
   * Otherwise the memory is allocated on the heap.
   * Data allocated with ei_aligned_stack_alloc \b must be freed by calling ei_aligned_stack_free(PTR,SIZE).
   * \code
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index eeb81c178..9d6d1d171 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -318,7 +318,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
     typedef typename ei_packet_traits<Scalar>::type Packet;
     enum { PacketSize = ei_packet_traits<Scalar>::size, Peeling = 2 };
 
-    int alignedStart = ei_alignmentOffset(y, size);
+    int alignedStart = ei_first_aligned(y, size);
     int alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
 
     const Packet pc = ei_pset1(Scalar(j.c()));
@@ -336,7 +336,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
     Scalar* px = x + alignedStart;
     Scalar* py = y + alignedStart;
 
-    if(ei_alignmentOffset(x, size)==alignedStart)
+    if(ei_first_aligned(x, size)==alignedStart)
     {
       for(int i=alignedStart; i<alignedEnd; i+=PacketSize)
       {
diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh
index f93ccad58..f14c8c383 100644
--- a/bench/btl/libs/eigen2/eigen2_interface.hh
+++ b/bench/btl/libs/eigen2/eigen2_interface.hh
@@ -124,7 +124,7 @@ public :
       Scalar* A0 = dst.data() + j*dst.stride();
       int starti = j;
       int alignedEnd = starti;
-      int alignedStart = (starti) + ei_alignmentOffset(&A0[starti], size-starti);
+      int alignedStart = (starti) + ei_first_aligned(&A0[starti], size-starti);
       alignedEnd = alignedStart + ((size-alignedStart)/(2*PacketSize))*(PacketSize*2);
 
       // do the non-vectorizable part of the assignment
diff --git a/bench/btl/libs/hand_vec/hand_vec_interface.hh b/bench/btl/libs/hand_vec/hand_vec_interface.hh
index 6080b2460..4b54c03a3 100755
--- a/bench/btl/libs/hand_vec/hand_vec_interface.hh
+++ b/bench/btl/libs/hand_vec/hand_vec_interface.hh
@@ -265,7 +265,7 @@ public :
       
       int starti = j+2;
       int alignedEnd = starti;
-      int alignedStart = (starti) + ei_alignmentOffset(&X[starti], N-starti);
+      int alignedStart = (starti) + ei_first_aligned(&X[starti], N-starti);
       alignedEnd = alignedStart + ((N-alignedStart)/(PacketSize))*(PacketSize);
 
       X[j]   += t0 * A0[j];
diff --git a/doc/I03_InsideEigenExample.dox b/doc/I03_InsideEigenExample.dox
index d4960e79d..95cbe6800 100644
--- a/doc/I03_InsideEigenExample.dox
+++ b/doc/I03_InsideEigenExample.dox
@@ -343,7 +343,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
     const int size = dst.size();
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
     const int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
-                           : ei_alignmentOffset(&dst.coeffRef(0), size);
+                           : ei_first_aligned(&dst.coeffRef(0), size);
     const int alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
 
     for(int index = 0; index < alignedStart; index++)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ebb8a59d2..a64e09126 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -88,6 +88,7 @@ ei_add_test(meta)
 ei_add_test(sizeof)
 ei_add_test(dynalloc)
 ei_add_test(nomalloc)
+ei_add_test(first_aligned)
 ei_add_test(mixingtypes)
 ei_add_test(packetmath)
 ei_add_test(unalignedassert)
diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp
new file mode 100644
index 000000000..3cf1a7eef
--- /dev/null
+++ b/test/first_aligned.cpp
@@ -0,0 +1,64 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+
+template<typename Scalar>
+void test_first_aligned_helper(Scalar *array, int size)
+{
+  const int packet_size = sizeof(Scalar) * ei_packet_traits<Scalar>::size;
+  VERIFY(((size_t(array) + sizeof(Scalar) * ei_first_aligned(array, size)) % packet_size) == 0);
+}
+
+template<typename Scalar>
+void test_none_aligned_helper(Scalar *array, int size)
+{
+  VERIFY(ei_packet_traits<Scalar>::size == 1 || ei_first_aligned(array, size) == size);
+}
+
+struct some_non_vectorizable_type { float x; };
+
+void test_first_aligned()
+{
+  EIGEN_ALIGN16 float array_float[100];
+  test_first_aligned_helper(array_float, 50);
+  test_first_aligned_helper(array_float+1, 50);
+  test_first_aligned_helper(array_float+2, 50);
+  test_first_aligned_helper(array_float+3, 50);
+  test_first_aligned_helper(array_float+4, 50);
+  test_first_aligned_helper(array_float+5, 50);
+  
+  EIGEN_ALIGN16 double array_double[100];
+  test_first_aligned_helper(array_float, 50);
+  test_first_aligned_helper(array_float+1, 50);
+  test_first_aligned_helper(array_float+2, 50);
+  
+  double *array_double_plus_4_bytes = (double*)(size_t(array_double)+4);
+  test_none_aligned_helper(array_double_plus_4_bytes, 50);
+  test_none_aligned_helper(array_double_plus_4_bytes+1, 50);
+  
+  some_non_vectorizable_type array_nonvec[100];
+  test_first_aligned_helper(array_nonvec, 100);
+  test_none_aligned_helper(array_nonvec, 100);
+}