24 files changed, 813 insertions, 288 deletions
diff --git a/Eigen/Core b/Eigen/Core
index 5e3e0960a..3135d7530 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -221,6 +221,7 @@ using std::size_t;
 #if defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/SSE/Complex.h"
 #elif defined EIGEN_VECTORIZE_ALTIVEC
   #include "src/Core/arch/AltiVec/PacketMath.h"
 #elif defined EIGEN_VECTORIZE_NEON
diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h
index 512e93883..8c2eacd96 100644
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@@ -126,6 +126,9 @@ template<typename MatrixType, int DiagIndex> class Diagonal
     EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
     EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
     EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
+    // trigger a compile-time error if someone tries to call packet
+    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
+    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
 };
 
 /** \returns an expression of the main diagonal of the matrix \c *this
diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h
index 7caf3858f..610d13dc8 100644
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@@ -36,8 +36,16 @@ struct ei_traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Flags = (HereditaryBits & (unsigned int)(MatrixType::Flags))
-          | (PacketAccessBit & (unsigned int)(MatrixType::Flags) & (unsigned int)(DiagonalType::DiagonalVectorType::Flags)),
+
+    _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
+    _PacketOnDiag = !((int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
+                    ||(int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = ei_is_same_type<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ret,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::Flags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && ((!_PacketOnDiag) || (bool(int(DiagonalType::Flags)&PacketAccessBit))),
+
+    Flags = (HereditaryBits & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0),
     CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
   };
 };
@@ -69,26 +77,34 @@ class DiagonalProduct : ei_no_assignment_operator,
     EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
     {
       enum {
-        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor,
-        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && ((InnerSize%16) == 0)) ? Aligned : Unaligned
+        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
       };
       const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
 
-      if((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
-       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight))
-      {
-        return ei_pmul(m_matrix.template packet<LoadMode>(row, col),
-                       ei_pset1(m_diagonal.diagonal().coeff(indexInDiagonalVector)));
-      }
-      else
-      {
-        return ei_pmul(m_matrix.template packet<LoadMode>(row, col),
-                       m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(indexInDiagonalVector));
-      }
+      return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename ei_meta_if<
+        ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
+       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), ei_meta_true, ei_meta_false>::ret());
     }
 
   protected:
+    template<int LoadMode> // ei_meta_true overload, picked by packet() when its storage-order/product-side test holds
+    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_true) const
+    {
+      return ei_pmul(m_matrix.template packet<LoadMode>(row, col),
+                     ei_pset1(m_diagonal.diagonal().coeff(id))); // single diagonal coefficient expanded with ei_pset1
+    }
+
+    template<int LoadMode> // ei_meta_false overload: the diagonal vector is read as a packet as well
+    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_false) const
+    {
+      enum {
+        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && ((InnerSize%16) == 0)) ? Aligned : Unaligned // Aligned only when requested and InnerSize is a multiple of 16
+      };
+      return ei_pmul(m_matrix.template packet<LoadMode>(row, col),
+                     m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id)); // packet of diagonal coefficients starting at id
+    }
+
     const typename MatrixType::Nested m_matrix;
     const typename DiagonalType::Nested m_diagonal;
 };
diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h
index 8eaa62185..9fc2fb60e 100644
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@@ -41,7 +41,7 @@ struct ei_dot_nocheck
 {
   static inline typename ei_traits<T>::Scalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.conjugate().cwiseProduct(b).sum();
+    return a.template binaryExpr<ei_scalar_conj_product_op<typename ei_traits<T>::Scalar> >(b).sum();
   }
 };
 
@@ -50,7 +50,7 @@ struct ei_dot_nocheck<T, U, true>
 {
   static inline typename ei_traits<T>::Scalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
-    return a.adjoint().cwiseProduct(b).sum();
+    return a.transpose().template binaryExpr<ei_scalar_conj_product_op<typename ei_traits<T>::Scalar> >(b).sum();
   }
 };
 
diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h
index 78d1e5628..9084905aa 100644
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -46,7 +46,7 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_sum_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = ei_packet_traits<Scalar>::size>1
+    PacketAccess = ei_packet_traits<Scalar>::HasAdd
   };
 };
 
@@ -69,7 +69,29 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_product_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::MulCost,
-    PacketAccess = ei_packet_traits<Scalar>::size>1
+    PacketAccess = ei_packet_traits<Scalar>::HasMul
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the conjugate product of two scalars
+  *
+  * This is a shortcut for ei_conj(x) * y, which is needed for optimization purposes
+  */
+template<typename Scalar> struct ei_scalar_conj_product_op {
+  enum { Conj = NumTraits<Scalar>::IsComplex }; // conjugation is requested only for complex scalars
+  EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conj_product_op)
+  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const
+  { return ei_conj_helper<Scalar,Scalar,Conj,false>().pmul(a,b); } // first flag = Conj (applies to a), second flag = false (b as is)
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
+  { return ei_conj_helper<PacketScalar,PacketScalar,Conj,false>().pmul(a,b); } // packet counterpart of operator()
+};
+template<typename Scalar>
+struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = ei_packet_traits<Scalar>::HasMul
   };
 };
 
@@ -92,7 +114,7 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_min_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = ei_packet_traits<Scalar>::size>1
+    PacketAccess = ei_packet_traits<Scalar>::HasMin
   };
 };
 
@@ -115,7 +137,7 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_max_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = ei_packet_traits<Scalar>::size>1
+    PacketAccess = ei_packet_traits<Scalar>::HasMax
   };
 };
 
@@ -158,7 +180,7 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_difference_op<Scalar> > {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = ei_packet_traits<Scalar>::size>1
+    PacketAccess = ei_packet_traits<Scalar>::HasSub
   };
 };
 
@@ -178,10 +200,7 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
   enum {
     Cost = 2 * NumTraits<Scalar>::MulCost,
-    PacketAccess = ei_packet_traits<Scalar>::size>1
-                  #if (defined EIGEN_VECTORIZE)
-                  && !NumTraits<Scalar>::IsInteger
-                  #endif
+    PacketAccess = ei_packet_traits<Scalar>::HasDiv
   };
 };
 
@@ -203,7 +222,7 @@ template<typename Scalar>
 struct ei_functor_traits<ei_scalar_opposite_op<Scalar> >
 { enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = int(ei_packet_traits<Scalar>::size)>1 };
+    PacketAccess = ei_packet_traits<Scalar>::HasNegate };
 };
 
 /** \internal
@@ -224,7 +243,7 @@ struct ei_functor_traits<ei_scalar_abs_op<Scalar> >
 {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = int(ei_packet_traits<Scalar>::size)>1
+    PacketAccess = ei_packet_traits<Scalar>::HasAbs
   };
 };
 
@@ -243,7 +262,7 @@ template<typename Scalar> struct ei_scalar_abs2_op {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_abs2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasAbs2 }; };
 
 /** \internal
   * \brief Template functor to compute the conjugate of a complex value
@@ -254,14 +273,14 @@ template<typename Scalar> struct ei_scalar_conjugate_op {
   EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conjugate_op)
   EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return ei_conj(a); }
   template<typename PacketScalar>
-  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return a; }
+  EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return ei_pconj(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_conjugate_op<Scalar> >
 {
   enum {
     Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
-    PacketAccess = int(ei_packet_traits<Scalar>::size)>1
+    PacketAccess = ei_packet_traits<Scalar>::HasConj
   };
 };
 
@@ -398,7 +417,7 @@ struct ei_scalar_multiple_op {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; };
 
 template<typename Scalar1, typename Scalar2>
 struct ei_scalar_multiple2_op {
@@ -425,7 +444,7 @@ struct ei_scalar_quotient1_impl {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,false> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; };
 
 template<typename Scalar>
 struct ei_scalar_quotient1_impl<Scalar,true> {
@@ -472,7 +491,8 @@ struct ei_scalar_constant_op {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_constant_op<Scalar> >
-{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::size>1, IsRepeatable = true }; };
+// FIXME replace this packet test by a safe one
+{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
 
 template<typename Scalar> struct ei_scalar_identity_op {
   EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_identity_op)
@@ -543,7 +563,7 @@ struct ei_linspaced_op_impl<Scalar,true>
 // nested expressions).
 template <typename Scalar, bool RandomAccess = true> struct ei_linspaced_op;
 template <typename Scalar, bool RandomAccess> struct ei_functor_traits< ei_linspaced_op<Scalar,RandomAccess> >
-{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::size>1, IsRepeatable = true }; };
+{ enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
 template <typename Scalar, bool RandomAccess> struct ei_linspaced_op
 {
   typedef typename ei_packet_traits<Scalar>::type PacketScalar;
@@ -588,7 +608,7 @@ struct ei_scalar_add_op {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_add_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = ei_packet_traits<Scalar>::size>1 }; };
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = ei_packet_traits<Scalar>::HasAdd }; };
 
 /** \internal
   * \brief Template functor to compute the square root of a scalar
@@ -676,7 +696,7 @@ struct ei_scalar_inverse_op {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasDiv }; };
 
 /** \internal
   * \brief Template functor to compute the square of a scalar
@@ -692,7 +712,7 @@ struct ei_scalar_square_op {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_square_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; };
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; };
 
 /** \internal
   * \brief Template functor to compute the cube of a scalar
@@ -708,7 +728,7 @@ struct ei_scalar_cube_op {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_cube_op<Scalar> >
-{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = int(ei_packet_traits<Scalar>::size)>1 }; };
+{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = ei_packet_traits<Scalar>::HasMul }; };
 
 // default functor traits for STL functors:
 
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 643e12e34..77b1d748e 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -58,8 +58,11 @@ struct ei_default_packet_traits
     HasMul    = 1,
     HasNegate = 1,
     HasAbs    = 1,
+    HasAbs2   = 1,
     HasMin    = 1,
     HasMax    = 1,
+    HasConj   = 1,
+    HasSetLinear = 1,
 
     HasDiv    = 0,
     HasSqrt   = 0,
@@ -79,15 +82,21 @@ struct ei_default_packet_traits
 template<typename T> struct ei_packet_traits : ei_default_packet_traits
 {
   typedef T type;
-  enum {size=1};
+  enum {
+    Vectorizable = 0,
+    size = 1
+  };
   enum {
     HasAdd    = 0,
     HasSub    = 0,
     HasMul    = 0,
     HasNegate = 0,
     HasAbs    = 0,
+    HasAbs2   = 0,
     HasMin    = 0,
-    HasMax    = 0
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0
   };
 };
 
@@ -105,6 +114,10 @@ ei_psub(const Packet& a,
 template<typename Packet> inline Packet
 ei_pnegate(const Packet& a) { return -a; }
 
+/** \internal \returns conj(a) (coeff-wise) */
+template<typename Packet> inline Packet
+ei_pconj(const Packet& a) { return ei_conj(a); }
+
 /** \internal \returns a * b (coeff-wise) */
 template<typename Packet> inline Packet
 ei_pmul(const Packet& a,
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 66435e0e3..ca30c4dce 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -378,6 +378,8 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
                                * RhsBlasTraits::extractScalarFactor(prod.rhs());
 
     enum {
+      // FIXME I think here we really have to check for ei_packet_traits<Scalar>::size==1
+      // because in this case it is fine to have an inner stride
       DirectlyUseRhs = ((ei_packet_traits<Scalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit))
                      && (!(_ActualRhsType::Flags & RowMajorBit))
     };
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index e3454b554..9e6a375ea 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -85,8 +85,12 @@ static Packet4f ei_p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)ei_p4i_MINUS1, (Pack
 
 template<> struct ei_packet_traits<float>  : ei_default_packet_traits
 {
-  typedef Packet4f type; enum {size=4};
+  typedef Packet4f type;
   enum {
+    Vectorizable = 1,
+    size=4,
+
+    // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
@@ -95,7 +99,14 @@ template<> struct ei_packet_traits<float>  : ei_default_packet_traits
   };
 };
 template<> struct ei_packet_traits<int>    : ei_default_packet_traits
-{ typedef Packet4i type; enum {size=4}; };
+{
+  typedef Packet4i type;
+  enum {
+    // FIXME check the Has*
+    Vectorizable = 1,
+    size=4
+  };
+};
 
 template<> struct ei_unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
 template<> struct ei_unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 2f0efbcc9..aaa27b56d 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -59,8 +59,12 @@ typedef int32x4_t   Packet4i;
 
 template<> struct ei_packet_traits<float>  : ei_default_packet_traits
 {
-  typedef Packet4f type; enum {size=4};
+  typedef Packet4f type;
   enum {
+    Vectorizable = 1,
+    size = 4,
+
+    // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
@@ -69,7 +73,14 @@ template<> struct ei_packet_traits<float>  : ei_default_packet_traits
   };
 };
 template<> struct ei_packet_traits<int>    : ei_default_packet_traits
-{ typedef Packet4i type; enum {size=4}; };
+{
+  typedef Packet4i type;
+  enum {
+    Vectorizable = 1,
+    size=4
+    // FIXME check the Has*
+  };
+};
 
 template<> struct ei_unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
 template<> struct ei_unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
new file mode 100644
index 000000000..4ecfc2f43
--- /dev/null
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_COMPLEX_SSE_H
+#define EIGEN_COMPLEX_SSE_H
+
+//---------- float ----------
+struct Packet2cf // wrapper around one __m128; its ei_packet_traits/ei_unpacket_traits declare size 2 (std::complex<float>)
+{
+  EIGEN_STRONG_INLINE Packet2cf() {} // leaves v uninitialized
+  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} // explicit: no implicit conversion from __m128
+  __m128  v; // raw SSE value
+};
+
+template<> struct ei_packet_traits<std::complex<float> >  : ei_default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+{ // returns a packet holding `from` in both of its complex slots
+  Packet2cf res;
+  res.v = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&from); // zeroed source instead of the uninitialized res.v; low 64 bits <- from
+  return Packet2cf(_mm_movelh_ps(res.v,res.v)); // duplicate the low 64 bits into the high 64 bits
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); // sign bit set in all four 32-bit lanes
+  return Packet2cf(_mm_xor_ps(a.v,mask)); // xor with the sign bits flips the sign of every float
+}
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pconj(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); // sign bit set in lanes 1 and 3 only, i.e. the imaginary parts
+  return Packet2cf(_mm_xor_ps(a.v,mask)); // flips the sign of the imaginary parts, real parts unchanged
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ // complex product; comments below assume ei_vec4f_swizzle1 (defined outside this file) selects lanes like ei_vec4i_swizzle1
+  // TODO optimize it for SSE3 and 4
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), // (re(a)*re(b), re(a)*im(b)) for each complex
+                                 _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), // im(a) in both lanes of each complex
+                                            ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); // b with re/im swapped; addsub subtracts in even lanes, adds in odd lanes
+  #else
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); // sign bit set in lanes 0 and 2 (real parts)
+  return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), // (re(a)*re(b), re(a)*im(b))
+                              _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                    ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); // (-im(a)*im(b), im(a)*re(b))
+  #endif // EIGEN_VECTORIZE_SSE3
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  ei_pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  _mm_storel_pi((__m64*)&res, a.v); // writes the low 64 bits of a.v (the first complex value) into res
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(ei_preverse(_mm_castps_pd(a.v)))); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a)
+{
+  return ei_pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); // add the high complex onto the low one, then extract the low one
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs)
+{ // reads vecs[0] and vecs[1]; result = (sum of the two complex values of vecs[0], sum of the two complex values of vecs[1])
+  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); // (low halves of both) + (high halves of both)
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  return ei_pfirst(ei_pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); // complex product of a with its own high half; the low slot holds low*high
+}
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1) // any other Offset leaves first untouched
+    {
+      first.v = _mm_movehl_ps(first.v, first.v); // low half <- high complex of first
+      first.v = _mm_movelh_ps(first.v, second.v); // high half <- low complex of second
+    }
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true> // product a * conj(b), see the SSE3 branch of pmul
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); } // pmul(x,y) + c
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(a, ei_pconj(b));
+    #else // same product with the sign flip folded into the partial products
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); // sign bit in lanes 1 and 3
+    return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), // (re(a)*re(b), -re(a)*im(b))
+                                _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                           ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); // (im(a)*im(b), im(a)*re(b))
+    #endif // EIGEN_VECTORIZE_SSE3
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false> // product conj(a) * b, see the SSE3 branch of pmul
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); } // pmul(x,y) + c
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(ei_pconj(a), b);
+    #else // same product with the sign flip folded into the partial products
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); // sign bit in lanes 1 and 3
+    return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), // (re(a)*re(b), re(a)*im(b))
+                                _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                      ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); // (im(a)*im(b), -im(a)*re(b))
+    #endif // EIGEN_VECTORIZE_SSE3
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true> // product conj(a * b), see the SSE3 branch of pmul
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); } // pmul(x,y) + c
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pconj(ei_pmul(a, b));
+    #else // same product with the sign flip folded into the partial products
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); // sign bit in lanes 1 and 3
+    return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), // (re(a)*re(b), -re(a)*im(b))
+                                _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                           ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); // minus (im(a)*im(b), im(a)*re(b))
+    #endif // EIGEN_VECTORIZE_SSE3
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{ // computes a * conj(b) divided lane-wise by re(b)^2 + im(b)^2
+  // TODO optimize it for SSE3 and 4
+  Packet2cf res = ei_conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); // numerator: a * conj(b)
+  __m128 s = _mm_mul_ps(b.v,b.v); // square of every component of b
+  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); // 0xb1 swaps neighbouring lanes, so each lane of the divisor is re^2 + im^2
+}
+
+//---------- double ----------
+struct Packet1cd // wrapper around one __m128d; its ei_packet_traits/ei_unpacket_traits declare size 1 (std::complex<double>)
+{
+  EIGEN_STRONG_INLINE Packet1cd() {} // leaves v uninitialized
+  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} // explicit: no implicit conversion from __m128d
+  __m128d  v; // raw SSE2 value
+};
+
+template<> struct ei_packet_traits<std::complex<double> >  : ei_default_packet_traits
+{
+  typedef Packet1cd type;
+  enum {
+    Vectorizable = 1,
+    size = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct ei_unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd ei_padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pnegate(const Packet1cd& a) { return Packet1cd(ei_pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pconj(const Packet1cd& a)
+{
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); // _mm_set_epi32 takes the highest word first: only the sign bit of the upper double is set
+  return Packet1cd(_mm_xor_pd(a.v,mask)); // flips the sign of the upper double (imaginary part)
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ // complex product of the single complex<double> held by a and b
+  // TODO optimize it for SSE3 and 4
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), // (re(a)*re(b), re(a)*im(b))
+                                 _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1), // im(a) in both lanes
+                                            ei_vec2d_swizzle1(b.v, 1, 0)))); // b with re/im swapped; addsub subtracts in lane 0, adds in lane 1
+  #else
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); // sign bit of the lower double only
+  return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), // (re(a)*re(b), re(a)*im(b))
+                              _mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
+                                                    ei_vec2d_swizzle1(b.v, 1, 0)), mask))); // (-im(a)*im(b), im(a)*re(b))
+  #endif // EIGEN_VECTORIZE_SSE3
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  ei_pfirst<Packet1cd>(const Packet1cd& a)
+{
+  EIGEN_ALIGN16 std::complex<double> res; // presumably 16-byte aligned so the aligned store below is valid
+  _mm_store_pd((double*)&res, a.v); // aligned store of both doubles (real, imaginary) into res
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ei_preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> ei_predux<Packet1cd>(const Packet1cd& a)
+{
+  return ei_pfirst(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ei_preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+  return vecs[0];
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> ei_predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return ei_pfirst(a);
+}
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet1cd>
+{
+  EIGEN_STRONG_INLINE static void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) // no-op for every Offset
+  {
+    // FIXME are we sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> is 16 bytes wide, it is not necessarily aligned on a 16-byte boundary...
+  }
+};
+
+template<> struct ei_conj_helper<Packet1cd, Packet1cd, false,true> // product a * conj(b), see the SSE3 branch of pmul
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return ei_padd(pmul(x,y),c); } // pmul(x,y) + c
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(a, ei_pconj(b));
+    #else // same product with the sign flip folded into the partial products
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); // sign bit of the upper double only
+    return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask), // (re(a)*re(b), -re(a)*im(b))
+                                _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
+                                           ei_vec2d_swizzle1(b.v, 1, 0)))); // (im(a)*im(b), im(a)*re(b))
+    #endif // EIGEN_VECTORIZE_SSE3
+  }
+};
+
+template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,false> // product conj(a) * b, see the SSE3 branch of pmul
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return ei_padd(pmul(x,y),c); } // pmul(x,y) + c
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pmul(ei_pconj(a), b);
+    #else // same product with the sign flip folded into the partial products
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); // sign bit of the upper double only
+    return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), // (re(a)*re(b), re(a)*im(b))
+                                _mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
+                                                      ei_vec2d_swizzle1(b.v, 1, 0)), mask))); // (im(a)*im(b), -im(a)*re(b))
+    #endif // EIGEN_VECTORIZE_SSE3
+  }
+};
+
+template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,true> // product conj(a * b), see the SSE3 branch of pmul
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return ei_padd(pmul(x,y),c); } // pmul(x,y) + c
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return ei_pconj(ei_pmul(a, b));
+    #else // same product with the sign flip folded into the partial products
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); // sign bit of the upper double only
+    return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask), // (re(a)*re(b), -re(a)*im(b))
+                                _mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
+                                           ei_vec2d_swizzle1(b.v, 1, 0)))); // minus (im(a)*im(b), im(a)*re(b))
+    #endif // EIGEN_VECTORIZE_SSE3
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ // computes a * conj(b) divided lane-wise by re(b)^2 + im(b)^2
+  // TODO optimize it for SSE3 and 4
+  Packet1cd res = ei_conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); // numerator: a * conj(b)
+  __m128d s = _mm_mul_pd(b.v,b.v); // (re(b)^2, im(b)^2)
+  return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); // shuffle 0x1 swaps the two doubles, so both lanes of the divisor are re^2 + im^2
+}
+
+#endif // EIGEN_COMPLEX_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 29375bdae..9382fbde5 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -43,6 +43,9 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; };
 #define ei_vec4i_swizzle1(v,p,q,r,s) \
   (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
 
+// ei_vec2d_swizzle1(v,p,q): yields the __m128d (v[p], v[q]) for lane indices p,q in {0,1}; p and q are parenthesized so that expressions may be passed.
+#define ei_vec2d_swizzle1(v,p,q) (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (((q)*2+1)<<6|((q)*2)<<4|((p)*2+1)<<2|((p)*2)))))
+
 #define ei_vec4f_swizzle2(a,b,p,q,r,s) \
   (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
 
@@ -58,10 +61,15 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; };
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
 
+
 template<> struct ei_packet_traits<float>  : ei_default_packet_traits
 {
-  typedef Packet4f type; enum {size=4};
+  typedef Packet4f type;
   enum {
+    Vectorizable = 1,
+    size=4,
+
+    HasDiv    = 1,
     HasSin  = EIGEN_FAST_MATH,
     HasCos  = EIGEN_FAST_MATH,
     HasLog  = 1,
@@ -70,9 +78,24 @@ template<> struct ei_packet_traits<float>  : ei_default_packet_traits
   };
 };
 template<> struct ei_packet_traits<double> : ei_default_packet_traits
-{ typedef Packet2d type; enum {size=2}; };
+{ // packet traits for double: Packet2d, i.e. two coefficients per packet
+  typedef Packet2d type;
+  enum {
+    Vectorizable = 1, // read as ei_packet_traits<Scalar>::Vectorizable by the matrix-vector product kernels
+    size=2, // coefficients per packet, same value as ei_unpacket_traits<Packet2d>::size
+
+    HasDiv    = 1 // presumably advertises that packet division is available for double -- confirm where Has* flags are consumed
+  };
+};
 template<> struct ei_packet_traits<int>    : ei_default_packet_traits
-{ typedef Packet4i type; enum {size=4}; };
+{ // packet traits for int: Packet4i, i.e. four coefficients per packet
+  typedef Packet4i type;
+  enum {
+    // FIXME review which Has* flags should be set for int (float and double set HasDiv; none is set here)
+    Vectorizable = 1, // read as ei_packet_traits<Scalar>::Vectorizable by the matrix-vector product kernels
+    size=4 // coefficients per packet, same value as ei_unpacket_traits<Packet4i>::size
+  };
+};
 
 template<> struct ei_unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
 template<> struct ei_unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
@@ -83,11 +106,11 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int    type; enum {size
 // that is inefficient :( (e.g., see ei_gemm_pack_rhs)
 template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) {
   Packet4f res = _mm_set_ss(from);
-  return _mm_shuffle_ps(res,res,0);
+  return ei_vec4f_swizzle1(res,0,0,0,0); // selects lane 0 for all four lanes; presumably equivalent to the replaced _mm_shuffle_ps(res,res,0)
 }
 template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double&  from) {
   Packet2d res = _mm_set_sd(from);
-  return _mm_unpacklo_pd(res,res);
+  return ei_vec2d_swizzle1(res, 0, 0); // (res[0], res[0]): the low double is replicated into both lanes
 }
 #else
 template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) { return _mm_set1_ps(from); }
diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h
index a17ce901b..1474bc1bb 100644
--- a/Eigen/src/Core/products/CoeffBasedProduct.h
+++ b/Eigen/src/Core/products/CoeffBasedProduct.h
@@ -318,7 +318,7 @@ struct ei_product_coeff_vectorized_dyn_selector
   typedef typename Lhs::Index Index;
   EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
-    res = lhs.row(row).cwiseProduct(rhs.col(col)).sum();
+    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum(); // sum of coefficient-wise products of lhs row `row` and rhs column `col`; the row is transposed first, presumably so both operands have the same (column) shape
   }
 };
 
@@ -330,7 +330,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
   typedef typename Lhs::Index Index;
   EIGEN_STRONG_INLINE static void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
-    res = lhs.cwiseProduct(rhs.col(col)).sum();
+    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum(); // specialization with LhsRows==1: the whole lhs is used (row index ignored) and transposed before the coefficient-wise product with rhs column `col`
   }
 };
 
@@ -340,7 +340,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
   typedef typename Lhs::Index Index;
   EIGEN_STRONG_INLINE static void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
-    res = lhs.row(row).cwiseProduct(rhs).sum();
+    res = lhs.row(row).transpose().cwiseProduct(rhs).sum(); // specialization with RhsCols==1: the whole rhs is used (column index ignored); lhs row `row` is transposed before the coefficient-wise product
   }
 };
 
@@ -350,7 +350,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
   typedef typename Lhs::Index Index;
   EIGEN_STRONG_INLINE static void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
   {
-    res = lhs.cwiseProduct(rhs).sum();
+    res = lhs.transpose().cwiseProduct(rhs).sum(); // specialization with LhsRows==1 and RhsCols==1: both indices ignored; lhs is transposed before the coefficient-wise product with rhs, then the products are summed
   }
 };
 
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 2c42ad5b6..cf133f68f 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -134,13 +134,13 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st
 }
 
 #ifdef EIGEN_HAS_FUSE_CJMADD
-  #define CJMADD(A,B,C,T)  C = cj.pmadd(A,B,C);
+  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C); /* C = CJ.pmadd(A,B,C); the conj helper is now an explicit argument and T is unused in this variant */
 #else
-  #define CJMADD(A,B,C,T)  T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
+  #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); /* T is a scratch lvalue and is overwritten (callers may pass B itself); expands to three statements, so use only inside a braced block */
 #endif
 
 // optimized GEneral packed Block * packed Panel product kernel
-template<typename Scalar, typename Index, int mr, int nr, typename Conj>
+template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct ei_gebp_kernel
 {
   void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
@@ -150,7 +150,8 @@ struct ei_gebp_kernel
     enum { PacketSize = ei_packet_traits<Scalar>::size };
     if(strideA==-1) strideA = depth;
     if(strideB==-1) strideB = depth;
-    Conj cj;
+    ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj;
+    ei_conj_helper<PacketType,PacketType,ConjugateLhs,ConjugateRhs> pcj;
     Index packet_cols = (cols/nr) * nr;
     const Index peeled_mc = (rows/mr)*mr;
     const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
@@ -259,42 +260,43 @@ struct ei_gebp_kernel
             #ifndef EIGEN_HAS_FUSE_CJMADD
             PacketType T0;
             #endif
-
+EIGEN_ASM_COMMENT("mybegin");
             A0 = ei_pload(&blA[0*PacketSize]);
             A1 = ei_pload(&blA[1*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
             B0 = ei_pload(&blB[1*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,B0);
+            MADD(pcj,A0,B0,C1,T0);
+            MADD(pcj,A1,B0,C5,B0);
 
             A0 = ei_pload(&blA[2*PacketSize]);
             A1 = ei_pload(&blA[3*PacketSize]);
             B0 = ei_pload(&blB[2*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
             B0 = ei_pload(&blB[3*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,B0);
+            MADD(pcj,A0,B0,C1,T0);
+            MADD(pcj,A1,B0,C5,B0);
 
             A0 = ei_pload(&blA[4*PacketSize]);
             A1 = ei_pload(&blA[5*PacketSize]);
             B0 = ei_pload(&blB[4*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
             B0 = ei_pload(&blB[5*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,B0);
+            MADD(pcj,A0,B0,C1,T0);
+            MADD(pcj,A1,B0,C5,B0);
 
             A0 = ei_pload(&blA[6*PacketSize]);
             A1 = ei_pload(&blA[7*PacketSize]);
             B0 = ei_pload(&blB[6*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
             B0 = ei_pload(&blB[7*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,B0);
+            MADD(pcj,A0,B0,C1,T0);
+            MADD(pcj,A1,B0,C5,B0);
+EIGEN_ASM_COMMENT("myend");
           }
           else
           {
@@ -302,65 +304,66 @@ struct ei_gebp_kernel
             #ifndef EIGEN_HAS_FUSE_CJMADD
             PacketType T0;
             #endif
-
+EIGEN_ASM_COMMENT("mybegin");
             A0 = ei_pload(&blA[0*PacketSize]);
             A1 = ei_pload(&blA[1*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
             B1 = ei_pload(&blB[1*PacketSize]);
 
-            CJMADD(A0,B0,C0,T0);
+            MADD(pcj,A0,B0,C0,T0);
             B2 = ei_pload(&blB[2*PacketSize]);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A1,B0,C4,B0);
             B3 = ei_pload(&blB[3*PacketSize]);
             B0 = ei_pload(&blB[4*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,B1);
+            MADD(pcj,A0,B1,C1,T0);
+            MADD(pcj,A1,B1,C5,B1);
             B1 = ei_pload(&blB[5*PacketSize]);
-            CJMADD(A0,B2,C2,T0);
-            CJMADD(A1,B2,C6,B2);
+            MADD(pcj,A0,B2,C2,T0);
+            MADD(pcj,A1,B2,C6,B2);
             B2 = ei_pload(&blB[6*PacketSize]);
-            CJMADD(A0,B3,C3,T0);
+            MADD(pcj,A0,B3,C3,T0);
             A0 = ei_pload(&blA[2*PacketSize]);
-            CJMADD(A1,B3,C7,B3);
+            MADD(pcj,A1,B3,C7,B3);
             A1 = ei_pload(&blA[3*PacketSize]);
             B3 = ei_pload(&blB[7*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
             B0 = ei_pload(&blB[8*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,B1);
+            MADD(pcj,A0,B1,C1,T0);
+            MADD(pcj,A1,B1,C5,B1);
             B1 = ei_pload(&blB[9*PacketSize]);
-            CJMADD(A0,B2,C2,T0);
-            CJMADD(A1,B2,C6,B2);
+            MADD(pcj,A0,B2,C2,T0);
+            MADD(pcj,A1,B2,C6,B2);
             B2 = ei_pload(&blB[10*PacketSize]);
-            CJMADD(A0,B3,C3,T0);
+            MADD(pcj,A0,B3,C3,T0);
             A0 = ei_pload(&blA[4*PacketSize]);
-            CJMADD(A1,B3,C7,B3);
+            MADD(pcj,A1,B3,C7,B3);
             A1 = ei_pload(&blA[5*PacketSize]);
             B3 = ei_pload(&blB[11*PacketSize]);
 
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
             B0 = ei_pload(&blB[12*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,B1);
+            MADD(pcj,A0,B1,C1,T0);
+            MADD(pcj,A1,B1,C5,B1);
             B1 = ei_pload(&blB[13*PacketSize]);
-            CJMADD(A0,B2,C2,T0);
-            CJMADD(A1,B2,C6,B2);
+            MADD(pcj,A0,B2,C2,T0);
+            MADD(pcj,A1,B2,C6,B2);
             B2 = ei_pload(&blB[14*PacketSize]);
-            CJMADD(A0,B3,C3,T0);
+            MADD(pcj,A0,B3,C3,T0);
             A0 = ei_pload(&blA[6*PacketSize]);
-            CJMADD(A1,B3,C7,B3);
+            MADD(pcj,A1,B3,C7,B3);
             A1 = ei_pload(&blA[7*PacketSize]);
             B3 = ei_pload(&blB[15*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,B1);
-            CJMADD(A0,B2,C2,T0);
-            CJMADD(A1,B2,C6,B2);
-            CJMADD(A0,B3,C3,T0);
-            CJMADD(A1,B3,C7,B3);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
+            MADD(pcj,A0,B1,C1,T0);
+            MADD(pcj,A1,B1,C5,B1);
+            MADD(pcj,A0,B2,C2,T0);
+            MADD(pcj,A1,B2,C6,B2);
+            MADD(pcj,A0,B3,C3,T0);
+            MADD(pcj,A1,B3,C7,B3);
+EIGEN_ASM_COMMENT("myend");
           }
 
           blB += 4*nr*PacketSize;
@@ -379,11 +382,11 @@ struct ei_gebp_kernel
             A0 = ei_pload(&blA[0*PacketSize]);
             A1 = ei_pload(&blA[1*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A1,B0,C4,B0);
             B0 = ei_pload(&blB[1*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
-            CJMADD(A1,B0,C5,B0);
+            MADD(pcj,A0,B0,C1,T0);
+            MADD(pcj,A1,B0,C5,B0);
           }
           else
           {
@@ -397,16 +400,16 @@ struct ei_gebp_kernel
             B0 = ei_pload(&blB[0*PacketSize]);
             B1 = ei_pload(&blB[1*PacketSize]);
 
-            CJMADD(A0,B0,C0,T0);
+            MADD(pcj,A0,B0,C0,T0);
             B2 = ei_pload(&blB[2*PacketSize]);
-            CJMADD(A1,B0,C4,B0);
+            MADD(pcj,A1,B0,C4,B0);
             B3 = ei_pload(&blB[3*PacketSize]);
-            CJMADD(A0,B1,C1,T0);
-            CJMADD(A1,B1,C5,B1);
-            CJMADD(A0,B2,C2,T0);
-            CJMADD(A1,B2,C6,B2);
-            CJMADD(A0,B3,C3,T0);
-            CJMADD(A1,B3,C7,B3);
+            MADD(pcj,A0,B1,C1,T0);
+            MADD(pcj,A1,B1,C5,B1);
+            MADD(pcj,A0,B2,C2,T0);
+            MADD(pcj,A1,B2,C6,B2);
+            MADD(pcj,A0,B3,C3,T0);
+            MADD(pcj,A1,B3,C7,B3);
           }
 
           blB += nr*PacketSize;
@@ -466,23 +469,23 @@ struct ei_gebp_kernel
             A0 = ei_pload(&blA[0*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
             B1 = ei_pload(&blB[1*PacketSize]);
-            CJMADD(A0,B0,C0,B0);
+            MADD(pcj,A0,B0,C0,B0);
             B0 = ei_pload(&blB[2*PacketSize]);
-            CJMADD(A0,B1,C1,B1);
+            MADD(pcj,A0,B1,C1,B1);
             A0 = ei_pload(&blA[1*PacketSize]);
             B1 = ei_pload(&blB[3*PacketSize]);
-            CJMADD(A0,B0,C0,B0);
+            MADD(pcj,A0,B0,C0,B0);
             B0 = ei_pload(&blB[4*PacketSize]);
-            CJMADD(A0,B1,C1,B1);
+            MADD(pcj,A0,B1,C1,B1);
             A0 = ei_pload(&blA[2*PacketSize]);
             B1 = ei_pload(&blB[5*PacketSize]);
-            CJMADD(A0,B0,C0,B0);
+            MADD(pcj,A0,B0,C0,B0);
             B0 = ei_pload(&blB[6*PacketSize]);
-            CJMADD(A0,B1,C1,B1);
+            MADD(pcj,A0,B1,C1,B1);
             A0 = ei_pload(&blA[3*PacketSize]);
             B1 = ei_pload(&blB[7*PacketSize]);
-            CJMADD(A0,B0,C0,B0);
-            CJMADD(A0,B1,C1,B1);
+            MADD(pcj,A0,B0,C0,B0);
+            MADD(pcj,A0,B1,C1,B1);
           }
           else
           {
@@ -492,41 +495,41 @@ struct ei_gebp_kernel
             B0 = ei_pload(&blB[0*PacketSize]);
             B1 = ei_pload(&blB[1*PacketSize]);
 
-            CJMADD(A0,B0,C0,B0);
+            MADD(pcj,A0,B0,C0,B0);
             B2 = ei_pload(&blB[2*PacketSize]);
             B3 = ei_pload(&blB[3*PacketSize]);
             B0 = ei_pload(&blB[4*PacketSize]);
-            CJMADD(A0,B1,C1,B1);
+            MADD(pcj,A0,B1,C1,B1);
             B1 = ei_pload(&blB[5*PacketSize]);
-            CJMADD(A0,B2,C2,B2);
+            MADD(pcj,A0,B2,C2,B2);
             B2 = ei_pload(&blB[6*PacketSize]);
-            CJMADD(A0,B3,C3,B3);
+            MADD(pcj,A0,B3,C3,B3);
             A0 = ei_pload(&blA[1*PacketSize]);
             B3 = ei_pload(&blB[7*PacketSize]);
-            CJMADD(A0,B0,C0,B0);
+            MADD(pcj,A0,B0,C0,B0);
             B0 = ei_pload(&blB[8*PacketSize]);
-            CJMADD(A0,B1,C1,B1);
+            MADD(pcj,A0,B1,C1,B1);
             B1 = ei_pload(&blB[9*PacketSize]);
-            CJMADD(A0,B2,C2,B2);
+            MADD(pcj,A0,B2,C2,B2);
             B2 = ei_pload(&blB[10*PacketSize]);
-            CJMADD(A0,B3,C3,B3);
+            MADD(pcj,A0,B3,C3,B3);
             A0 = ei_pload(&blA[2*PacketSize]);
             B3 = ei_pload(&blB[11*PacketSize]);
 
-            CJMADD(A0,B0,C0,B0);
+            MADD(pcj,A0,B0,C0,B0);
             B0 = ei_pload(&blB[12*PacketSize]);
-            CJMADD(A0,B1,C1,B1);
+            MADD(pcj,A0,B1,C1,B1);
             B1 = ei_pload(&blB[13*PacketSize]);
-            CJMADD(A0,B2,C2,B2);
+            MADD(pcj,A0,B2,C2,B2);
             B2 = ei_pload(&blB[14*PacketSize]);
-            CJMADD(A0,B3,C3,B3);
+            MADD(pcj,A0,B3,C3,B3);
 
             A0 = ei_pload(&blA[3*PacketSize]);
             B3 = ei_pload(&blB[15*PacketSize]);
-            CJMADD(A0,B0,C0,B0);
-            CJMADD(A0,B1,C1,B1);
-            CJMADD(A0,B2,C2,B2);
-            CJMADD(A0,B3,C3,B3);
+            MADD(pcj,A0,B0,C0,B0);
+            MADD(pcj,A0,B1,C1,B1);
+            MADD(pcj,A0,B2,C2,B2);
+            MADD(pcj,A0,B3,C3,B3);
           }
 
           blB += 4*nr*PacketSize;
@@ -544,9 +547,9 @@ struct ei_gebp_kernel
 
             A0 = ei_pload(&blA[0*PacketSize]);
             B0 = ei_pload(&blB[0*PacketSize]);
-            CJMADD(A0,B0,C0,T0);
+            MADD(pcj,A0,B0,C0,T0);
             B0 = ei_pload(&blB[1*PacketSize]);
-            CJMADD(A0,B0,C1,T0);
+            MADD(pcj,A0,B0,C1,T0);
           }
           else
           {
@@ -561,10 +564,10 @@ struct ei_gebp_kernel
             B2 = ei_pload(&blB[2*PacketSize]);
             B3 = ei_pload(&blB[3*PacketSize]);
 
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A0,B1,C1,T1);
-            CJMADD(A0,B2,C2,T0);
-            CJMADD(A0,B3,C3,T1);
+            MADD(pcj,A0,B0,C0,T0);
+            MADD(pcj,A0,B1,C1,T1);
+            MADD(pcj,A0,B2,C2,T0);
+            MADD(pcj,A0,B3,C3,T1);
           }
 
           blB += nr*PacketSize;
@@ -596,9 +599,9 @@ struct ei_gebp_kernel
 
             A0 = blA[k];
             B0 = blB[0*PacketSize];
-            CJMADD(A0,B0,C0,T0);
+            MADD(cj,A0,B0,C0,T0);
             B0 = blB[1*PacketSize];
-            CJMADD(A0,B0,C1,T0);
+            MADD(cj,A0,B0,C1,T0);
           }
           else
           {
@@ -613,10 +616,10 @@ struct ei_gebp_kernel
             B2 = blB[2*PacketSize];
             B3 = blB[3*PacketSize];
 
-            CJMADD(A0,B0,C0,T0);
-            CJMADD(A0,B1,C1,T1);
-            CJMADD(A0,B2,C2,T0);
-            CJMADD(A0,B3,C3,T1);
+            MADD(cj,A0,B0,C0,T0);
+            MADD(cj,A0,B1,C1,T1);
+            MADD(cj,A0,B2,C2,T0);
+            MADD(cj,A0,B3,C3,T1);
           }
 
           blB += nr*PacketSize;
@@ -662,8 +665,8 @@ struct ei_gebp_kernel
           A0 = ei_pload(&blA[0*PacketSize]);
           A1 = ei_pload(&blA[1*PacketSize]);
           B0 = ei_pload(&blB[0*PacketSize]);
-          CJMADD(A0,B0,C0,T0);
-          CJMADD(A1,B0,C4,T1);
+          MADD(pcj,A0,B0,C0,T0);
+          MADD(pcj,A1,B0,C4,T1);
 
           blB += PacketSize;
           blA += mr;
@@ -683,7 +686,8 @@ struct ei_gebp_kernel
         const Scalar* blB = unpackedB;
         for(Index k=0; k<depth; k++)
         {
-          C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0);
+          PacketType T0;
+          MADD(pcj,ei_pload(blA), ei_pload(blB), C0, T0);
           blB += PacketSize;
           blA += PacketSize;
         }
@@ -700,7 +704,12 @@ struct ei_gebp_kernel
         // FIXME directly use blockB ??
         const Scalar* blB = unpackedB;
         for(Index k=0; k<depth; k++)
-          C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
+        {
+          #ifndef EIGEN_HAS_FUSE_CJMADD
+          Scalar T0;
+          #endif
+          MADD(cj,blA[k], blB[k*PacketSize], C0, T0);
+        }
         res[(j2+0)*resStride + i] += C0;
       }
     }
@@ -769,8 +778,8 @@ struct ei_gemm_pack_lhs
 //  4  5  6  7   16 17 18 19   25 28
 //  8  9 10 11   20 21 22 23   26 29
 //  .  .  .  .    .  .  .  .    .  .
-template<typename Scalar, typename Index, int nr, bool PanelMode>
-struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode>
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
 {
   typedef typename ei_packet_traits<Scalar>::type Packet;
   enum { PacketSize = ei_packet_traits<Scalar>::size };
@@ -778,6 +787,7 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode>
                   Index stride=0, Index offset=0)
   {
     ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
     bool hasAlpha = alpha != Scalar(1);
     Index packet_cols = (cols/nr) * nr;
     Index count = 0;
@@ -792,19 +802,19 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode>
       if (hasAlpha)
         for(Index k=0; k<depth; k++)
         {
-                    blockB[count+0] = alpha*b0[k];
-                    blockB[count+1] = alpha*b1[k];
-          if(nr==4) blockB[count+2] = alpha*b2[k];
-          if(nr==4) blockB[count+3] = alpha*b3[k];
+                    blockB[count+0] = alpha*cj(b0[k]);
+                    blockB[count+1] = alpha*cj(b1[k]);
+          if(nr==4) blockB[count+2] = alpha*cj(b2[k]);
+          if(nr==4) blockB[count+3] = alpha*cj(b3[k]);
           count += nr;
         }
       else
         for(Index k=0; k<depth; k++)
         {
-                    blockB[count+0] = b0[k];
-                    blockB[count+1] = b1[k];
-          if(nr==4) blockB[count+2] = b2[k];
-          if(nr==4) blockB[count+3] = b3[k];
+                    blockB[count+0] = cj(b0[k]);
+                    blockB[count+1] = cj(b1[k]);
+          if(nr==4) blockB[count+2] = cj(b2[k]);
+          if(nr==4) blockB[count+3] = cj(b3[k]);
           count += nr;
         }
       // skip what we have after
@@ -819,13 +829,13 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode>
       if (hasAlpha)
         for(Index k=0; k<depth; k++)
         {
-          blockB[count] = alpha*b0[k];
+          blockB[count] = alpha*cj(b0[k]);
           count += 1;
         }
       else
         for(Index k=0; k<depth; k++)
         {
-          blockB[count] = b0[k];
+          blockB[count] = cj(b0[k]);
           count += 1;
         }
       if(PanelMode) count += (stride-offset-depth);
@@ -834,14 +844,15 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, ColMajor, PanelMode>
 };
 
 // this version is optimized for row major matrices
-template<typename Scalar, typename Index, int nr, bool PanelMode>
-struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode>
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
 {
   enum { PacketSize = ei_packet_traits<Scalar>::size };
   void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols,
                   Index stride=0, Index offset=0)
   {
     ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
     bool hasAlpha = alpha != Scalar(1);
     Index packet_cols = (cols/nr) * nr;
     Index count = 0;
@@ -854,10 +865,10 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode>
         for(Index k=0; k<depth; k++)
         {
           const Scalar* b0 = &rhs[k*rhsStride + j2];
-                    blockB[count+0] = alpha*b0[0];
-                    blockB[count+1] = alpha*b0[1];
-          if(nr==4) blockB[count+2] = alpha*b0[2];
-          if(nr==4) blockB[count+3] = alpha*b0[3];
+                    blockB[count+0] = alpha*cj(b0[0]);
+                    blockB[count+1] = alpha*cj(b0[1]);
+          if(nr==4) blockB[count+2] = alpha*cj(b0[2]);
+          if(nr==4) blockB[count+3] = alpha*cj(b0[3]);
           count += nr;
         }
       }
@@ -866,10 +877,10 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode>
         for(Index k=0; k<depth; k++)
         {
           const Scalar* b0 = &rhs[k*rhsStride + j2];
-                    blockB[count+0] = b0[0];
-                    blockB[count+1] = b0[1];
-          if(nr==4) blockB[count+2] = b0[2];
-          if(nr==4) blockB[count+3] = b0[3];
+                    blockB[count+0] = cj(b0[0]);
+                    blockB[count+1] = cj(b0[1]);
+          if(nr==4) blockB[count+2] = cj(b0[2]);
+          if(nr==4) blockB[count+3] = cj(b0[3]);
           count += nr;
         }
       }
@@ -883,7 +894,7 @@ struct ei_gemm_pack_rhs<Scalar, Index, nr, RowMajor, PanelMode>
       const Scalar* b0 = &rhs[j2];
       for(Index k=0; k<depth; k++)
       {
-        blockB[count] = alpha*b0[k*rhsStride];
+        blockB[count] = alpha*cj(b0[k*rhsStride]);
         count += 1;
       }
       if(PanelMode) count += stride-offset-depth;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 39b283a3f..2ae78c1e7 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -73,9 +73,6 @@ static void run(Index rows, Index cols, Index depth,
   ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
   ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
 
-  if (ConjugateRhs)
-    alpha = ei_conj(alpha);
-
   typedef typename ei_packet_traits<Scalar>::type PacketType;
   typedef ei_product_blocking_traits<Scalar> Blocking;
 
@@ -83,9 +80,18 @@ static void run(Index rows, Index cols, Index depth,
   Index mc = std::min(rows,blocking.mc());  // cache block size along the M direction
   //Index nc = blocking.nc(); // cache block size along the N direction
 
-  ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
-  ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
-  ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp;
+  // FIXME starting from SSE3, normal complex product cannot be optimized as well as
+  // conjugate product, therefore it is better to conjugate during the copies.
+  // With SSE2, this is the other way round.
+  ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs;
+  ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
+  ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr> gebp;
+
+//   if (ConjugateRhs)
+//     alpha = ei_conj(alpha);
+//   ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
+//   ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
+//   ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
 
 #ifdef EIGEN_HAS_OPENMP
   if(info)
@@ -237,7 +243,7 @@ struct ei_gemm_functor
   {
     if(cols==-1)
       cols = m_rhs.cols();
-      
+
     Gemm::run(rows, cols, m_lhs.cols(),
               (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(),
               (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(),
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 5d8da247c..e671a657e 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -48,18 +48,22 @@ void ei_cache_friendly_product_colmajor_times_vector(
     ei_pstore(&res[j], \
       ei_padd(ei_pload(&res[j]), \
         ei_padd( \
-          ei_padd(cj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]),    ptmp0), \
-                  cj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]),   ptmp1)), \
-          ei_padd(cj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]),    ptmp2), \
-                  cj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]),   ptmp3)) )))
-
-  ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
-  if(ConjugateRhs)
-    alpha = ei_conj(alpha);
+          ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]),    ptmp0), \
+                  pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]),   ptmp1)), \
+          ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]),    ptmp2), \
+                  pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]),   ptmp3)) )))
 
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef typename ei_packet_traits<Scalar>::type Packet;
-  const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+  enum {
+    PacketSize = sizeof(Packet)/sizeof(Scalar),
+    Vectorizable = ei_packet_traits<Scalar>::Vectorizable
+  };
+
+  ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj;
+  ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj;
+  if(ConjugateRhs)
+    alpha = ei_conj(alpha);
 
   enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
   const Index columnsAtOnce = 4;
@@ -84,7 +88,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
   // find how many columns do we have to skip to be aligned with the result (if possible)
   Index skipColumns = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(res)%sizeof(RealScalar)) )
+  if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) )
   {
     alignedSize = 0;
     alignedStart = 0;
@@ -113,6 +117,12 @@ void ei_cache_friendly_product_colmajor_times_vector(
                       || PacketSize > size
                       || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
   }
+  else if(Vectorizable)
+  {
+    alignedStart = 0;
+    alignedSize = size;
+    alignmentPattern = AllAligned;
+  }
 
   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
@@ -127,7 +137,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
     const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                  *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
 
-    if (PacketSize>1)
+    if (Vectorizable)
     {
       /* explicit vectorization */
       // process initial unaligned coeffs
@@ -168,19 +178,19 @@ void ei_cache_friendly_product_colmajor_times_vector(
 
                 A00 = ei_pload (&lhs0[j]);
                 A10 = ei_pload (&lhs0[j+PacketSize]);
-                A00 = cj.pmadd(A00, ptmp0, ei_pload(&res[j]));
-                A10 = cj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize]));
+                A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j]));
+                A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize]));
 
-                A00 = cj.pmadd(A01, ptmp1, A00);
+                A00 = pcj.pmadd(A01, ptmp1, A00);
                 A01 = ei_pload(&lhs1[j-1+2*PacketSize]);  ei_palign<1>(A11,A01);
-                A00 = cj.pmadd(A02, ptmp2, A00);
+                A00 = pcj.pmadd(A02, ptmp2, A00);
                 A02 = ei_pload(&lhs2[j-2+2*PacketSize]);  ei_palign<2>(A12,A02);
-                A00 = cj.pmadd(A03, ptmp3, A00);
+                A00 = pcj.pmadd(A03, ptmp3, A00);
                 ei_pstore(&res[j],A00);
                 A03 = ei_pload(&lhs3[j-3+2*PacketSize]);  ei_palign<3>(A13,A03);
-                A10 = cj.pmadd(A11, ptmp1, A10);
-                A10 = cj.pmadd(A12, ptmp2, A10);
-                A10 = cj.pmadd(A13, ptmp3, A10);
+                A10 = pcj.pmadd(A11, ptmp1, A10);
+                A10 = pcj.pmadd(A12, ptmp2, A10);
+                A10 = pcj.pmadd(A13, ptmp3, A10);
                 ei_pstore(&res[j+PacketSize],A10);
               }
             }
@@ -215,7 +225,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
       Packet ptmp0 = ei_pset1(alpha*rhs[i]);
       const Scalar* lhs0 = lhs + i*lhsStride;
 
-      if (PacketSize>1)
+      if (Vectorizable)
       {
         /* explicit vectorization */
         // process first unaligned result's coeffs
@@ -225,10 +235,10 @@ void ei_cache_friendly_product_colmajor_times_vector(
         // process aligned result's coeffs
         if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
           for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
-            ei_pstore(&res[j], cj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j])));
+            ei_pstore(&res[j], pcj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j])));
         else
           for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
-            ei_pstore(&res[j], cj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j])));
+            ei_pstore(&res[j], pcj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j])));
       }
 
       // process remaining scalars (or all if no explicit vectorization)
@@ -243,7 +253,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
     }
     else
       break;
-  } while(PacketSize>1);
+  } while(Vectorizable);
   #undef _EIGEN_ACCUMULATE_PACKETS
 }
 
@@ -261,16 +271,20 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 
   #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
     Packet b = ei_pload(&rhs[j]); \
-    ptmp0 = cj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \
-    ptmp1 = cj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \
-    ptmp2 = cj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \
-    ptmp3 = cj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); }
-
-  ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
+    ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \
+    ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \
+    ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \
+    ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); }
 
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef typename ei_packet_traits<Scalar>::type Packet;
-  const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+  enum {
+    PacketSize = sizeof(Packet)/sizeof(Scalar),
+    Vectorizable = ei_packet_traits<Scalar>::Vectorizable
+  };
+
+  ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj;
+  ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj;
 
   enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
   const Index rowsAtOnce = 4;
@@ -297,7 +311,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   Index skipRows = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(rhs)%sizeof(RealScalar)) )
+  if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) )
   {
     alignedSize = 0;
     alignedStart = 0;
@@ -326,6 +340,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
                       || PacketSize > rhsSize
                       || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
   }
+  else if(Vectorizable)
+  {
+    alignedStart = 0;
+    alignedSize = size;
+    alignmentPattern = AllAligned;
+  }
 
   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
@@ -333,13 +353,14 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   Index rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
-    Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
+    EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
+    Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
 
     // this helps the compiler generating good binary code
     const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                  *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
 
-    if (PacketSize>1)
+    if (Vectorizable)
     {
       /* explicit vectorization */
       Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
@@ -386,19 +407,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
                 A12 = ei_pload(&lhs2[j-2+PacketSize]);  ei_palign<2>(A02,A12);
                 A13 = ei_pload(&lhs3[j-3+PacketSize]);  ei_palign<3>(A03,A13);
 
-                ptmp0 = cj.pmadd(ei_pload (&lhs0[j]), b, ptmp0);
-                ptmp1 = cj.pmadd(A01, b, ptmp1);
+                ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0);
+                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                 A01 = ei_pload(&lhs1[j-1+2*PacketSize]);  ei_palign<1>(A11,A01);
-                ptmp2 = cj.pmadd(A02, b, ptmp2);
+                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                 A02 = ei_pload(&lhs2[j-2+2*PacketSize]);  ei_palign<2>(A12,A02);
-                ptmp3 = cj.pmadd(A03, b, ptmp3);
+                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                 A03 = ei_pload(&lhs3[j-3+2*PacketSize]);  ei_palign<3>(A13,A03);
 
                 b = ei_pload(&rhs[j+PacketSize]);
-                ptmp0 = cj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0);
-                ptmp1 = cj.pmadd(A11, b, ptmp1);
-                ptmp2 = cj.pmadd(A12, b, ptmp2);
-                ptmp3 = cj.pmadd(A13, b, ptmp3);
+                ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0);
+                ptmp1 = pcj.pmadd(A11, b, ptmp1);
+                ptmp2 = pcj.pmadd(A12, b, ptmp2);
+                ptmp3 = pcj.pmadd(A13, b, ptmp3);
               }
             }
             for (Index j = peeledSize; j<alignedSize; j+=PacketSize)
@@ -434,7 +455,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   {
     for (Index i=start; i<end; ++i)
     {
-      Scalar tmp0 = Scalar(0);
+      EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
       Packet ptmp0 = ei_pset1(tmp0);
       const Scalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
@@ -447,10 +468,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
         // process aligned rhs coeffs
         if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
           for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
-            ptmp0 = cj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
+            ptmp0 = pcj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
         else
           for (Index j = alignedStart;j<alignedSize;j+=PacketSize)
-            ptmp0 = cj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
+            ptmp0 = pcj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0);
         tmp0 += ei_predux(ptmp0);
       }
 
@@ -468,7 +489,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
     }
     else
       break;
-  } while(PacketSize>1);
+  } while(Vectorizable);
 
   #undef _EIGEN_ACCUMULATE_PACKETS
 }
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 24b711ced..d8fa1bd9c 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -270,7 +270,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,Conjugate
     Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
     Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     ei_symm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
     ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
     ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
@@ -353,7 +353,7 @@ struct ei_product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,Conjugat
     Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
     Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
     ei_symm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
 
diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h
index d6933adb6..4514c7692 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -46,8 +46,11 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
     FirstTriangular = IsRowMajor == IsLower
   };
 
-  ei_conj_helper<NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
-  ei_conj_helper<NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+  ei_conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
+  ei_conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+
+  ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
+  ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
 
   Scalar cjAlpha = ConjugateRhs ? ei_conj(alpha) : alpha;
 
@@ -121,9 +124,9 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
       Packet Bi  = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
       Packet Xi  = ei_pload (resIt);
 
-      Xi    = cj0.pmadd(A0i,ptmp0, cj0.pmadd(A1i,ptmp1,Xi));
-      ptmp2 = cj1.pmadd(A0i,  Bi, ptmp2);
-      ptmp3 = cj1.pmadd(A1i,  Bi, ptmp3);
+      Xi    = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
+      ptmp2 = pcj1.pmadd(A0i,  Bi, ptmp2);
+      ptmp3 = pcj1.pmadd(A1i,  Bi, ptmp3);
       ei_pstore(resIt,Xi); resIt += PacketSize;
     }
     for (size_t i=alignedEnd; i<endi; i++)
diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h
index eaf634de3..40c0c9aac 100644
--- a/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/Eigen/src/Core/products/SelfadjointProduct.h
@@ -32,7 +32,7 @@
 **********************************************************************/
 
 // forward declarations (defined at the end of this file)
-template<typename Scalar, typename Index, int mr, int nr, typename Conj, int UpLo>
+template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
 struct ei_sybb_kernel;
 
 /* Optimized selfadjoint product (_SYRK) */
@@ -84,12 +84,15 @@ struct ei_selfadjoint_product<Scalar, Index, MatStorageOrder, ColMajor, AAT, UpL
     Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
     // note that the actual rhs is the transpose/adjoint of mat
-    typedef ei_conj_helper<NumTraits<Scalar>::IsComplex && !AAT, NumTraits<Scalar>::IsComplex && AAT> Conj;
+    enum {
+      ConjLhs = NumTraits<Scalar>::IsComplex && !AAT,
+      ConjRhs = NumTraits<Scalar>::IsComplex && AAT
+    };
 
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conj> gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs> gebp_kernel;
     ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
     ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,MatStorageOrder, false> pack_lhs;
-    ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conj, UpLo> sybb;
+    ei_sybb_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjLhs, ConjRhs, UpLo> sybb;
 
     for(Index k2=0; k2<depth; k2+=kc)
     {
@@ -163,7 +166,7 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 //   while the selfadjoint block overlapping the diagonal is evaluated into a
 //   small temporary buffer which is then accumulated into the result using a
 //   triangular traversal.
-template<typename Scalar, typename Index, int mr, int nr, typename Conj, int UpLo>
+template<typename Scalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
 struct ei_sybb_kernel
 {
   enum {
@@ -172,7 +175,7 @@ struct ei_sybb_kernel
   };
   void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar* workspace)
   {
-    ei_gebp_kernel<Scalar, Index, mr, nr, Conj> gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
     Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer;
 
     // let's process the block per panel of actual_mc x BlockSize,
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index fd0a7c2b2..be9362958 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -129,7 +129,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     triangularBuffer.setZero();
     triangularBuffer.diagonal().setOnes();
 
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
     ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
 
@@ -254,10 +254,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     triangularBuffer.setZero();
     triangularBuffer.diagonal().setOnes();
 
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,LhsStorageOrder> pack_lhs;
     ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
-    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,true> pack_rhs_panel;
+    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
 
     for(Index k2=IsLower ? 0 : depth;
         IsLower ? k2<depth  : k2>0;
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 7038627fb..0fce7159e 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -74,9 +74,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
     Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
     ei_conj_if<Conjugate> conj;
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<Conjugate,false> > gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, Conjugate, false> gebp_kernel;
     ei_gemm_pack_lhs<Scalar, Index, Blocking::mr,TriStorageOrder> pack_lhs;
-    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, true> pack_rhs;
+    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, ColMajor, false, true> pack_rhs;
 
     for(Index k2=IsLower ? 0 : size;
         IsLower ? k2<size : k2>0;
@@ -212,9 +212,9 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
     Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
 
     ei_conj_if<Conjugate> conj;
-    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ei_conj_helper<false,Conjugate> > gebp_kernel;
+    ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, false, Conjugate> gebp_kernel;
     ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder> pack_rhs;
-    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,true> pack_rhs_panel;
+    ei_gemm_pack_rhs<Scalar, Index, Blocking::nr,RhsStorageOrder,false,true> pack_rhs_panel;
     ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 8bcd8c95f..38c86511c 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -29,10 +29,10 @@
 // implement and control fast level 2 and level 3 BLAS-like routines.
 
 // forward declarations
-template<typename Scalar, typename Index, int mr, int nr, typename Conj>
+template<typename Scalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
 struct ei_gebp_kernel;
 
-template<typename Scalar, typename Index, int nr, int StorageOrder, bool PanelMode=false>
+template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct ei_gemm_pack_rhs;
 
 template<typename Scalar, typename Index, int mr, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
@@ -53,46 +53,40 @@ template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index,
 static void ei_cache_friendly_product_rowmajor_times_vector(
   const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsSize, ResType& res, Scalar alpha);
 
-// Provides scalar/packet-wise product and product with accumulation
-// with optional conjugation of the arguments.
-template<bool ConjLhs, bool ConjRhs> struct ei_conj_helper;
-
-template<> struct ei_conj_helper<false,false>
+template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false> // neither operand conjugated: valid for any Scalar (scalar or packet type)
 {
-  template<typename T>
-  EIGEN_STRONG_INLINE T pmadd(const T& x, const T& y, const T& c) const { return  ei_pmadd(x,y,c); }
-  template<typename T>
-  EIGEN_STRONG_INLINE T pmul(const T& x, const T& y) const { return  ei_pmul(x,y); }
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return  ei_pmadd(x,y,c); } // multiply-accumulate, delegated unchanged to ei_pmadd
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return  ei_pmul(x,y); } // product, delegated unchanged to ei_pmul
 };
 
-template<> struct ei_conj_helper<false,true>
+template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true> // std::complex operands, only the rhs conjugated
 {
-  template<typename T> std::complex<T>
-  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const // accumulates: c + x*conj(y)
   { return c + pmul(x,y); }
 
-  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
-  { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); }
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const // x*conj(y), written out on real/imaginary parts
+  { return Scalar(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } // (xr*yr + xi*yi) + i*(xi*yr - xr*yi)
 };
 
-template<> struct ei_conj_helper<true,false>
+template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,false> // std::complex operands, only the lhs conjugated
 {
-  template<typename T> std::complex<T>
-  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const // accumulates: c + conj(x)*y
   { return c + pmul(x,y); }
 
-  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
-  { return std::complex<T>(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const // conj(x)*y, written out on real/imaginary parts
+  { return Scalar(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } // (xr*yr + xi*yi) + i*(xr*yi - xi*yr)
 };
 
-template<> struct ei_conj_helper<true,true>
+template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,true> // std::complex operands, both sides conjugated
 {
-  template<typename T> std::complex<T>
-  pmadd(const std::complex<T>& x, const std::complex<T>& y, const std::complex<T>& c) const
+  typedef std::complex<RealScalar> Scalar;
+  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const // accumulates: c + conj(x)*conj(y)
   { return c + pmul(x,y); }
 
-  template<typename T> std::complex<T> pmul(const std::complex<T>& x, const std::complex<T>& y) const
-  { return std::complex<T>(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
+  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const // conj(x)*conj(y) == conj(x*y), written out on real/imaginary parts
+  { return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } // (xr*yr - xi*yi) - i*(xr*yi + xi*yr)
 };
 
 // Lightweight helper class to access matrix coefficients.
@@ -140,6 +134,18 @@ struct ei_product_blocking_traits
   };
 };
 
+template<typename Real>
+struct ei_product_blocking_traits<std::complex<Real> > // blocking constants specialized for std::complex scalars
+{
+  typedef std::complex<Real> Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketType;
+  enum {
+    PacketSize = sizeof(PacketType)/sizeof(Scalar), // number of complex scalars held by one packet
+    nr = 2, // NOTE(review): presumably the Blocking::nr handed to ei_gebp_kernel / ei_gemm_pack_rhs - confirm against callers
+    mr = 2 * PacketSize // two packets' worth of scalars; NOTE(review): presumably the Blocking::mr handed to ei_gebp_kernel / ei_gemm_pack_lhs - confirm
+  };
+};
+
 /* Helper class to analyze the factors of a Product expression.
  * In particular it allows to pop out operator-, scalar multiples,
  * and conjugate */
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index ef923d867..310ffa4b3 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -106,9 +106,14 @@ template<typename Lhs, typename Rhs,
          int ProductType = ei_product_type<Lhs,Rhs>::value>
 struct ProductReturnType;
 
+// Provides scalar/packet-wise product and product with accumulation
+// with optional conjugation of the arguments.
+template<typename LhsScalar, typename RhsScalar, bool ConjLhs, bool ConjRhs> struct ei_conj_helper;
+
 template<typename Scalar> struct ei_scalar_sum_op;
 template<typename Scalar> struct ei_scalar_difference_op;
 template<typename Scalar> struct ei_scalar_product_op;
+template<typename Scalar> struct ei_scalar_conj_product_op;
 template<typename Scalar> struct ei_scalar_quotient_op;
 template<typename Scalar> struct ei_scalar_opposite_op;
 template<typename Scalar> struct ei_scalar_conjugate_op;
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index f33b576ea..c93398092 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -151,7 +151,7 @@ class ei_compute_matrix_flags
 
           )
       ) ? AlignedBit : 0,
-      packet_access_bit = ei_packet_traits<Scalar>::size > 1 && aligned_bit ? PacketAccessBit : 0
+      packet_access_bit = ei_packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
     };
 
   public:
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 49a0d8f5d..94bb5569e 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -324,7 +324,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
 
     const Packet pc = ei_pset1(Scalar(j.c()));
     const Packet ps = ei_pset1(Scalar(j.s()));
-    ei_conj_helper<NumTraits<Scalar>::IsComplex,false> cj;
+    ei_conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
 
     for(Index i=0; i<alignedStart; ++i)
     {
@@ -343,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
       {
         Packet xi = ei_pload(px);
         Packet yi = ei_pload(py);
-        ei_pstore(px, ei_padd(ei_pmul(pc,xi),cj.pmul(ps,yi)));
+        ei_pstore(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi)));
         ei_pstore(py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi)));
         px += PacketSize;
         py += PacketSize;
@@ -358,8 +358,8 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
         Packet xi1  = ei_ploadu(px+PacketSize);
         Packet yi   = ei_pload (py);
         Packet yi1  = ei_pload (py+PacketSize);
-        ei_pstoreu(px, ei_padd(ei_pmul(pc,xi),cj.pmul(ps,yi)));
-        ei_pstoreu(px+PacketSize, ei_padd(ei_pmul(pc,xi1),cj.pmul(ps,yi1)));
+        ei_pstoreu(px, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi)));
+        ei_pstoreu(px+PacketSize, ei_padd(ei_pmul(pc,xi1),pcj.pmul(ps,yi1)));
         ei_pstore (py, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi)));
         ei_pstore (py+PacketSize, ei_psub(ei_pmul(pc,yi1),ei_pmul(ps,xi1)));
         px += Peeling*PacketSize;
@@ -369,7 +369,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY&
       {
         Packet xi = ei_ploadu(x+peelingEnd);
         Packet yi = ei_pload (y+peelingEnd);
-        ei_pstoreu(x+peelingEnd, ei_padd(ei_pmul(pc,xi),cj.pmul(ps,yi)));
+        ei_pstoreu(x+peelingEnd, ei_padd(ei_pmul(pc,xi),pcj.pmul(ps,yi)));
         ei_pstore (y+peelingEnd, ei_psub(ei_pmul(pc,yi),ei_pmul(ps,xi)));
       }
     }