diff options
author | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 15:48:30 +0200 |
---|---|---|
committer | Gael Guennebaud <g.gael@free.fr> | 2010-07-11 15:48:30 +0200 |
commit | ff96c94043d575e4d0dd477c1ed2487e33f79627 (patch) | |
tree | 5e9736916779fdacd431c2591a3ec1f77333e505 /Eigen | |
parent | 4161b8be6772f2b7338458c9932d7417797966bb (diff) |
mixing types in product step 2:
* pload* and pset1 are now templated on the packet type
* gemv routines are now embedded into a structure with
a consistent API with respect to gemm
* some configurations of vector * matrix and matrix * matrix work fine,
some need more work...
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/DenseStorageBase.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/DiagonalProduct.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/Functors.h | 132 | ||||
-rw-r--r-- | Eigen/src/Core/GenericPacketMath.h | 20 | ||||
-rw-r--r-- | Eigen/src/Core/MapBase.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/Product.h | 63 | ||||
-rw-r--r-- | Eigen/src/Core/SolveTriangular.h | 13 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/Complex.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/AltiVec/PacketMath.h | 22 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/Complex.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 14 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/Complex.h | 25 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/MathFunctions.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/arch/SSE/PacketMath.h | 42 | ||||
-rw-r--r-- | Eigen/src/Core/products/CoeffBasedProduct.h | 72 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralBlockPanelKernel.h | 332 | ||||
-rw-r--r-- | Eigen/src/Core/products/GeneralMatrixVector.h | 315 | ||||
-rw-r--r-- | Eigen/src/Core/products/SelfadjointMatrixVector.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/products/TriangularMatrixVector.h | 11 | ||||
-rw-r--r-- | Eigen/src/Core/util/BlasUtil.h | 48 |
20 files changed, 603 insertions, 552 deletions
diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index 6886e3f97..16e0a86f1 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -108,7 +108,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type template<int LoadMode> EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const { - return ei_ploadt<Scalar, LoadMode> + return ei_ploadt<PacketScalar, LoadMode> (m_storage.data() + (Flags & RowMajorBit ? col + row * m_storage.cols() : row + col * m_storage.rows())); @@ -117,7 +117,7 @@ class DenseStorageBase : public ei_dense_xpr_base<Derived>::type template<int LoadMode> EIGEN_STRONG_INLINE PacketScalar packet(Index index) const { - return ei_ploadt<Scalar, LoadMode>(m_storage.data() + index); + return ei_ploadt<PacketScalar, LoadMode>(m_storage.data() + index); } template<int StoreMode> diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h index 610d13dc8..14c49e828 100644 --- a/Eigen/src/Core/DiagonalProduct.h +++ b/Eigen/src/Core/DiagonalProduct.h @@ -91,7 +91,7 @@ class DiagonalProduct : ei_no_assignment_operator, EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, ei_meta_true) const { return ei_pmul(m_matrix.template packet<LoadMode>(row, col), - ei_pset1(m_diagonal.diagonal().coeff(id))); + ei_pset1<PacketScalar>(m_diagonal.diagonal().coeff(id))); } template<int LoadMode> diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index 9084905aa..3f059d233 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -35,11 +35,11 @@ template<typename Scalar> struct ei_scalar_sum_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_sum_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + 
EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_padd(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux(a); } }; template<typename Scalar> @@ -58,11 +58,11 @@ struct ei_functor_traits<ei_scalar_sum_op<Scalar> > { template<typename Scalar> struct ei_scalar_product_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_product_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a * b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmul(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_mul(a); } }; template<typename Scalar> @@ -83,9 +83,9 @@ template<typename Scalar> struct ei_scalar_conj_product_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conj_product_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return ei_conj_helper<Scalar,Scalar,Conj,false>().pmul(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const - { return ei_conj_helper<PacketScalar,PacketScalar,Conj,false>().pmul(a,b); } + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return ei_conj_helper<Packet,Packet,Conj,false>().pmul(a,b); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_conj_product_op<Scalar> > { @@ -103,11 +103,11 @@ struct 
ei_functor_traits<ei_scalar_conj_product_op<Scalar> > { template<typename Scalar> struct ei_scalar_min_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_min_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::min(a, b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmin(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_min(a); } }; template<typename Scalar> @@ -126,11 +126,11 @@ struct ei_functor_traits<ei_scalar_min_op<Scalar> > { template<typename Scalar> struct ei_scalar_max_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_max_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return std::max(a, b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pmax(a,b); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const { return ei_predux_max(a); } }; template<typename Scalar> @@ -172,8 +172,8 @@ struct ei_functor_traits<ei_scalar_hypot_op<Scalar> > { template<typename Scalar> struct ei_scalar_difference_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_difference_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, 
const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_psub(a,b); } }; template<typename Scalar> @@ -192,8 +192,8 @@ struct ei_functor_traits<ei_scalar_difference_op<Scalar> > { template<typename Scalar> struct ei_scalar_quotient_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_quotient_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a / b; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return ei_pdiv(a,b); } }; template<typename Scalar> @@ -214,8 +214,8 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > { template<typename Scalar> struct ei_scalar_opposite_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_opposite_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pnegate(a); } }; template<typename Scalar> @@ -234,8 +234,8 @@ template<typename Scalar> struct ei_scalar_abs_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs_op) typedef typename NumTraits<Scalar>::Real result_type; EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pabs(a); } }; template<typename Scalar> @@ -256,8 +256,8 @@ template<typename Scalar> struct ei_scalar_abs2_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_abs2_op) typedef typename NumTraits<Scalar>::Real 
result_type; EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return ei_abs2(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pmul(a,a); } }; template<typename Scalar> @@ -272,8 +272,8 @@ struct ei_functor_traits<ei_scalar_abs2_op<Scalar> > template<typename Scalar> struct ei_scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_conjugate_op) EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return ei_conj(a); } - template<typename PacketScalar> - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const { return ei_pconj(a); } + template<typename Packet> + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return ei_pconj(a); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_conjugate_op<Scalar> > @@ -397,22 +397,22 @@ struct ei_functor_traits<ei_scalar_log_op<Scalar> > * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/ */ /* NOTE why doing the ei_pset1() in packetOp *is* an optimization ? - * indeed it seems better to declare m_other as a PacketScalar and do the ei_pset1() once + * indeed it seems better to declare m_other as a Packet and do the ei_pset1() once * in the constructor. 
However, in practice: - * - GCC does not like m_other as a PacketScalar and generate a load every time it needs it + * - GCC does not like m_other as a Packet and generate a load every time it needs it * - on the other hand GCC is able to moves the ei_pset1() away the loop :) * - simpler code ;) * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y) */ template<typename Scalar> struct ei_scalar_multiple_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> EIGEN_STRONG_INLINE ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_multiple_op(const Scalar& other) : m_other(other) { } EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; } - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a) const - { return ei_pmul(a, ei_pset1(m_other)); } + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return ei_pmul(a, ei_pset1<Packet>(m_other)); } typename ei_makeconst<typename NumTraits<Scalar>::Nested>::type m_other; }; template<typename Scalar> @@ -433,13 +433,13 @@ struct ei_functor_traits<ei_scalar_multiple2_op<Scalar1,Scalar2> > template<typename Scalar, bool IsInteger> struct ei_scalar_quotient1_impl { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {} EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; } - EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& 
a) const - { return ei_pmul(a, ei_pset1(m_other)); } + EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return ei_pmul(a, ei_pset1<Packet>(m_other)); } const Scalar m_other; }; template<typename Scalar> @@ -480,13 +480,13 @@ struct ei_functor_traits<ei_scalar_quotient1_op<Scalar> > template<typename Scalar> struct ei_scalar_constant_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; EIGEN_STRONG_INLINE ei_scalar_constant_op(const ei_scalar_constant_op& other) : m_other(other.m_other) { } EIGEN_STRONG_INLINE ei_scalar_constant_op(const Scalar& other) : m_other(other) { } template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index, Index = 0) const { return ei_pset1(m_other); } + EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return ei_pset1<Packet>(m_other); } const Scalar m_other; }; template<typename Scalar> @@ -513,22 +513,22 @@ template <typename Scalar, bool RandomAccess> struct ei_linspaced_op_impl; template <typename Scalar> struct ei_linspaced_op_impl<Scalar,false> { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op_impl(Scalar low, Scalar step) : m_low(low), m_step(step), - m_packetStep(ei_pset1(ei_packet_traits<Scalar>::size*step)), - m_base(ei_padd(ei_pset1(low),ei_pmul(ei_pset1(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {} + m_packetStep(ei_pset1<Packet>(ei_packet_traits<Scalar>::size*step)), + m_base(ei_padd(ei_pset1<Packet>(low),ei_pmul(ei_pset1<Packet>(step),ei_plset<Scalar>(-ei_packet_traits<Scalar>::size)))) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar 
packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); } + EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = ei_padd(m_base,m_packetStep); } const Scalar m_low; const Scalar m_step; - const PacketScalar m_packetStep; - mutable PacketScalar m_base; + const Packet m_packetStep; + mutable Packet m_base; }; // random access for packet ops: @@ -537,23 +537,23 @@ struct ei_linspaced_op_impl<Scalar,false> template <typename Scalar> struct ei_linspaced_op_impl<Scalar,true> { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op_impl(Scalar low, Scalar step) : m_low(low), m_step(step), - m_lowPacket(ei_pset1(m_low)), m_stepPacket(ei_pset1(m_step)), m_interPacket(ei_plset<Scalar>(0)) {} + m_lowPacket(ei_pset1<Packet>(m_low)), m_stepPacket(ei_pset1<Packet>(m_step)), m_interPacket(ei_plset<Scalar>(0)) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i) const - { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Scalar>(i),m_interPacket))); } + EIGEN_STRONG_INLINE const Packet packetOp(Index i) const + { return ei_padd(m_lowPacket, ei_pmul(m_stepPacket, ei_padd(ei_pset1<Packet>(i),m_interPacket))); } const Scalar m_low; const Scalar m_step; - const PacketScalar m_lowPacket; - const PacketScalar m_stepPacket; - const PacketScalar m_interPacket; + const Packet m_lowPacket; + const Packet m_stepPacket; + const Packet m_interPacket; }; // ----- Linspace functor ---------------------------------------------------------------- @@ -566,12 +566,12 @@ template <typename Scalar, bool RandomAccess> struct ei_functor_traits< ei_linsp { enum { Cost = 1, PacketAccess = ei_packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; }; template <typename Scalar, bool RandomAccess> struct ei_linspaced_op { - 
typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; ei_linspaced_op(Scalar low, Scalar high, int num_steps) : impl(low, (high-low)/(num_steps-1)) {} template<typename Index> EIGEN_STRONG_INLINE const Scalar operator() (Index i, Index = 0) const { return impl(i); } template<typename Index> - EIGEN_STRONG_INLINE const PacketScalar packetOp(Index i, Index = 0) const { return impl.packetOp(i); } + EIGEN_STRONG_INLINE const Packet packetOp(Index i, Index = 0) const { return impl.packetOp(i); } // This proxy object handles the actual required temporaries, the different // implementations (random vs. sequential access) as well as the piping // correct piping to size 2/4 packet operations. @@ -597,13 +597,13 @@ template<typename Scalar> struct ei_functor_allows_mixing_real_and_complex<ei_sc /* If you wonder why doing the ei_pset1() in packetOp() is an optimization check ei_scalar_multiple_op */ template<typename Scalar> struct ei_scalar_add_op { - typedef typename ei_packet_traits<Scalar>::type PacketScalar; + typedef typename ei_packet_traits<Scalar>::type Packet; // FIXME default copy constructors seems bugged with std::complex<> inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { } inline ei_scalar_add_op(const Scalar& other) : m_other(other) { } inline Scalar operator() (const Scalar& a) const { return a + m_other; } - inline const PacketScalar packetOp(const PacketScalar& a) const - { return ei_padd(a, ei_pset1(m_other)); } + inline const Packet packetOp(const Packet& a) const + { return ei_padd(a, ei_pset1<Packet>(m_other)); } const Scalar m_other; }; template<typename Scalar> @@ -690,9 +690,9 @@ template<typename Scalar> struct ei_scalar_inverse_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_inverse_op) inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const - { 
return ei_pdiv(ei_pset1(Scalar(1)),a); } + template<typename Packet> + inline const Packet packetOp(const Packet& a) const + { return ei_pdiv(ei_pset1<Packet>(Scalar(1)),a); } }; template<typename Scalar> struct ei_functor_traits<ei_scalar_inverse_op<Scalar> > @@ -706,8 +706,8 @@ template<typename Scalar> struct ei_scalar_square_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_square_op) inline Scalar operator() (const Scalar& a) const { return a*a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + inline const Packet packetOp(const Packet& a) const { return ei_pmul(a,a); } }; template<typename Scalar> @@ -722,8 +722,8 @@ template<typename Scalar> struct ei_scalar_cube_op { EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_cube_op) inline Scalar operator() (const Scalar& a) const { return a*a*a; } - template<typename PacketScalar> - inline const PacketScalar packetOp(const PacketScalar& a) const + template<typename Packet> + inline const Packet packetOp(const Packet& a) const { return ei_pmul(a,ei_pmul(a,a)); } }; template<typename Scalar> diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d290074e5..914f16e73 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -160,16 +160,16 @@ template<typename Packet> inline Packet ei_pandnot(const Packet& a, const Packet& b) { return a & (!b); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_pload(const Scalar* from) { return *from; } +template<typename Packet> inline Packet +ei_pload(const typename ei_unpacket_traits<Packet>::type* from) { return *from; } /** \internal \returns a packet version of \a *from, (un-aligned load) */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_ploadu(const Scalar* from) { return *from; } +template<typename Packet> 
inline Packet +ei_ploadu(const typename ei_unpacket_traits<Packet>::type* from) { return *from; } /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ -template<typename Scalar> inline typename ei_packet_traits<Scalar>::type -ei_pset1(const Scalar& a) { return a; } +template<typename Packet> inline Packet +ei_pset1(const typename ei_unpacket_traits<Packet>::type& a) { return a; } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ template<typename Scalar> inline typename ei_packet_traits<Scalar>::type @@ -256,13 +256,13 @@ ei_pmadd(const Packet& a, /** \internal \returns a packet version of \a *from. * \If LoadMode equals Aligned, \a from must be 16 bytes aligned */ -template<typename Scalar, int LoadMode> -inline typename ei_packet_traits<Scalar>::type ei_ploadt(const Scalar* from) +template<typename Packet, int LoadMode> +inline Packet ei_ploadt(const typename ei_unpacket_traits<Packet>::type* from) { if(LoadMode == Aligned) - return ei_pload(from); + return ei_pload<Packet>(from); else - return ei_ploadu(from); + return ei_ploadu<Packet>(from); } /** \internal copy the packet \a from to \a *to. 
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 6af7bb793..740f2b247 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -124,14 +124,14 @@ template<typename Derived> class MapBase template<int LoadMode> inline PacketScalar packet(Index row, Index col) const { - return ei_ploadt<Scalar, LoadMode> + return ei_ploadt<PacketScalar, LoadMode> (m_data + (col * colStride() + row * rowStride())); } template<int LoadMode> inline PacketScalar packet(Index index) const { - return ei_ploadt<Scalar, LoadMode>(m_data + index * innerStride()); + return ei_ploadt<PacketScalar, LoadMode>(m_data + index * innerStride()); } template<int StoreMode> diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 139132c6b..edd79bd9a 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -282,10 +282,13 @@ class GeneralProduct<Lhs, Rhs, GemvProduct> public: EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) { - EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) +// EIGEN_STATIC_ASSERT((ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret), +// YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) } enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight }; @@ -295,7 +298,8 @@ class GeneralProduct<Lhs, Rhs, GemvProduct> { ei_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols()); ei_gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? 
RowMajor : ColMajor, - bool(ei_blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha); + bool(ei_blas_traits<MatrixType>::HasUsableDirectAccess) + /*&& ei_is_same_type<typename Lhs::Scalar, typename Rhs::Scalar>::ret*/>::run(*this, dst, alpha); } }; @@ -319,43 +323,48 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true> template<typename ProductType, typename Dest> static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha) { - typedef typename ProductType::Scalar Scalar; + typedef typename ProductType::Index Index; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; typedef typename ProductType::ActualLhsType ActualLhsType; typedef typename ProductType::ActualRhsType ActualRhsType; typedef typename ProductType::LhsBlasTraits LhsBlasTraits; typedef typename ProductType::RhsBlasTraits RhsBlasTraits; - typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest; + typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest; ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) - * RhsBlasTraits::extractScalarFactor(prod.rhs()); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { // FIXME find a way to allow an inner stride on the result if ei_packet_traits<Scalar>::size==1 EvalToDest = Dest::InnerStrideAtCompileTime==1 }; - Scalar* EIGEN_RESTRICT actualDest; + ResScalar* actualDest; if (EvalToDest) actualDest = &dest.coeffRef(0); else { - actualDest = ei_aligned_stack_new(Scalar,dest.size()); + actualDest = ei_aligned_stack_new(ResScalar,dest.size()); MappedDest(actualDest, dest.size()) = dest; } - ei_cache_friendly_product_colmajor_times_vector - 
<LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>( - dest.size(), - &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), - actualRhs, actualDest, actualAlpha); + ei_general_matrix_vector_product + <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( + actualLhs.rows(), actualLhs.cols(), + &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), + actualRhs, actualRhs.innerStride(), + actualDest, 1, + actualAlpha); if (!EvalToDest) { dest = MappedDest(actualDest, dest.size()); - ei_aligned_stack_delete(Scalar, actualDest, dest.size()); + ei_aligned_stack_delete(ResScalar, actualDest, dest.size()); } } }; @@ -365,7 +374,9 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true> template<typename ProductType, typename Dest> static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha) { - typedef typename ProductType::Scalar Scalar; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; typedef typename ProductType::Index Index; typedef typename ProductType::ActualLhsType ActualLhsType; typedef typename ProductType::ActualRhsType ActualRhsType; @@ -376,34 +387,34 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true> ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) - * RhsBlasTraits::extractScalarFactor(prod.rhs()); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); enum { // FIXME I think here we really have to check for ei_packet_traits<Scalar>::size==1 // because in this case it is fine to have an inner stride - DirectlyUseRhs = ((ei_packet_traits<Scalar>::size==1) || 
(_ActualRhsType::Flags&ActualPacketAccessBit)) + DirectlyUseRhs = ((ei_packet_traits<RhsScalar>::size==1) || (_ActualRhsType::Flags&ActualPacketAccessBit)) && (!(_ActualRhsType::Flags & RowMajorBit)) }; - Scalar* EIGEN_RESTRICT rhs_data; + RhsScalar* rhs_data; if (DirectlyUseRhs) - rhs_data = reinterpret_cast<Scalar* EIGEN_RESTRICT>(&actualRhs.const_cast_derived().coeffRef(0)); + rhs_data = &actualRhs.const_cast_derived().coeffRef(0); else { - rhs_data = ei_aligned_stack_new(Scalar, actualRhs.size()); - Map<typename _ActualRhsType::PlainObject>(reinterpret_cast<Scalar*>(rhs_data), actualRhs.size()) = actualRhs; + rhs_data = ei_aligned_stack_new(RhsScalar, actualRhs.size()); + Map<typename _ActualRhsType::PlainObject>(rhs_data, actualRhs.size()) = actualRhs; } - ei_cache_friendly_product_rowmajor_times_vector - <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>( + ei_general_matrix_vector_product + <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run( actualLhs.rows(), actualLhs.cols(), &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), rhs_data, 1, &dest.coeffRef(0,0), dest.innerStride(), actualAlpha); - if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size()); + if (!DirectlyUseRhs) ei_aligned_stack_delete(RhsScalar, rhs_data, prod.rhs().size()); } }; diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 90ce2a802..f9e24a193 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -81,7 +81,7 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor Index startRow = IsLower ? pi : pi-actualPanelWidth; Index startCol = IsLower ? 
0 : pi; - ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>( + ei_general_matrix_vector_product<Index,Scalar,RowMajor,LhsProductTraits::NeedToConjugate,Scalar,false>::run( actualPanelWidth, r, &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(), &(other.coeffRef(startCol)), other.innerStride(), @@ -148,12 +148,11 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,ColMajor // let's directly call the low level product function because: // 1 - it is faster to compile // 2 - it is slighlty faster at runtime - ei_cache_friendly_product_colmajor_times_vector<LhsProductTraits::NeedToConjugate,false>( - r, - &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(), - other.segment(startBlock, actualPanelWidth), - &(other.coeffRef(endBlock, 0)), - Scalar(-1)); + ei_general_matrix_vector_product<Index,Scalar,ColMajor,LhsProductTraits::NeedToConjugate,Scalar,false>::run( + r, actualPanelWidth, + &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(), + other.segment(startBlock, actualPanelWidth), other.innerStride(), + &(other.coeffRef(endBlock, 0)), other.innerStride(), Scalar(-1)); } } } diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 2dba95a2f..ecada02f4 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -63,7 +63,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */ diff 
--git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index a3ceed8e8..8205beae5 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -59,13 +59,13 @@ typedef __vector unsigned char Packet16uc; Packet4i ei_p4i_##NAME = vec_splat_s32(X) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) @@ -158,7 +158,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) return s; } */ -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html float EIGEN_ALIGN16 af[4]; af[0] = from; @@ -167,7 +167,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { int EIGEN_ALIGN16 ai[4]; ai[0] = from; Packet4i vc = vec_ld(0, ai); @@ -175,8 +175,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vc; } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return 
vec_add(ei_pset1<Packet4f>(a), ei_p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1<Packet4i>(a), ei_p4i_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } @@ -241,7 +241,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by AltiVec"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -267,10 +267,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html @@ -282,7 
+282,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index bf68a2bbb..6d9e8da85 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -58,7 +58,7 @@ template<> struct ei_packet_traits<std::complex<float> > : ei_default_packet_tr template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { float32x2_t r64; r64 = vld1_f32((float *)&from); diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 5b0d6ab12..b899fece1 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -45,13 +45,13 @@ typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) #ifndef __pld #define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" ); @@ -88,18 +88,18 @@ template<> struct ei_packet_traits<int> : ei_default_packet_traits template<> struct ei_unpacket_traits<Packet4f> { 
typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { Packet4f countdown = { 3, 2, 1, 0 }; - return vaddq_f32(ei_pset1(a), countdown); + return vaddq_f32(ei_pset1<Packet4f>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { Packet4i countdown = { 3, 2, 1, 0 }; - return vaddq_s32(ei_pset1(a), countdown); + return vaddq_s32(ei_pset1<Packet4i>(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } @@ -137,7 +137,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by NEON"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 9d32ede0e..6c72293fc 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -89,15 +89,15 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_por <Packet2cf>(const Packet2cf& template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf 
ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload(&ei_real_ref(*from))); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu(&ei_real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload<Packet4f>(&ei_real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu<Packet4f>(&ei_real_ref(*from))); } template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore(&ei_real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu(&ei_real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>& from) +template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; res.v = _mm_loadl_pi(res.v, (const __m64*)&from); @@ -276,10 +276,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor <Packet1cd>(const Packet1cd& template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } // FIXME force unaligned load, this is a temporary fix -template<> EIGEN_STRONG_INLINE Packet1cd 
ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>& from) -{ /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<Packet1cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ei_ploadu<Packet1cd>(&from); } // FIXME force unaligned store, this is a temporary fix template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); } @@ -387,6 +389,15 @@ template<> struct ei_conj_helper<Packet2d, Packet1cd, false,false> { return Packet1cd(ei_pmul(x, y.v)); } }; +template<> struct ei_conj_helper<Packet1cd, Packet2d, false,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return ei_padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(ei_pmul(x.v, y)); } +}; + template<> EIGEN_STRONG_INLINE Packet1cd ei_pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h 
b/Eigen/src/Core/arch/SSE/MathFunctions.h index 742bfa92f..e4ca82985 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -378,14 +378,14 @@ Packet4f ei_pcos<Packet4f>(const Packet4f& _x) template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ei_psqrt<Packet4f>(const Packet4f& _x) { - Packet4f half = ei_pmul(_x, ei_pset1(.5f)); - - /* select only the inverse sqrt of non-zero inputs */ - Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1(std::numeric_limits<float>::epsilon())); - Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); - - x = ei_pmul(x, ei_psub(ei_pset1(1.5f), ei_pmul(half, ei_pmul(x,x)))); - return ei_pmul(_x,x); + Packet4f half = ei_pmul(_x, ei_pset1<Packet4f>(.5f)); + + /* select only the inverse sqrt of non-zero inputs */ + Packet4f non_zero_mask = _mm_cmpgt_ps(_x, ei_pset1<Packet4f>(std::numeric_limits<float>::epsilon())); + Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); + + x = ei_pmul(x, ei_psub(ei_pset1<Packet4f>(1.5f), ei_pmul(half, ei_pmul(x,x)))); + return ei_pmul(_x,x); } #endif // EIGEN_MATH_FUNCTIONS_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 91af346ed..53a9bcf56 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -53,13 +53,13 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - const Packet4f ei_p4f_##NAME = ei_pset1<float>(X) + const Packet4f ei_p4f_##NAME = ei_pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<int>(X)) + const Packet4f ei_p4f_##NAME = _mm_castsi128_ps(ei_pset1<Packet4i>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ - const Packet4i ei_p4i_##NAME = 
ei_pset1<int>(X) + const Packet4i ei_p4i_##NAME = ei_pset1<Packet4i>(X) template<> struct ei_packet_traits<float> : ei_default_packet_traits @@ -107,11 +107,11 @@ template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size #ifdef __GNUC__ // Sometimes GCC implements _mm_set1_p* using multiple moves, // that is inefficient :( (e.g., see ei_gemm_pack_rhs) -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { Packet4f res = _mm_set_ss(from); return ei_vec4f_swizzle1(res,0,0,0,0); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { #ifdef EIGEN_VECTORIZE_SSE3 return _mm_loaddup_pd(&from); #else @@ -120,14 +120,14 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { #endif } #else -template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return _mm_set1_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double& from) { return _mm_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } #endif -template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return _mm_set1_epi32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1(a), _mm_set_ps(3,2,1,0)); } -template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1(a),_mm_set_pd(1,0)); } -template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1(a),_mm_set_epi32(3,2,1,0)); } 
+template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return _mm_add_ps(ei_pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet2d ei_plset<double>(const double& a) { return _mm_add_pd(ei_pset1<Packet2d>(a),_mm_set_pd(1,0)); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return _mm_add_epi32(ei_pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d ei_padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } @@ -174,7 +174,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d ei_pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { ei_assert(false && "packet integer division are not supported by SSE"); - return ei_pset1<int>(0); + return ei_pset1<Packet4i>(0); } // for some weird raisons, it has to be overloaded for packet of integers @@ -214,14 +214,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } 
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d ei_pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } #if defined(_MSC_VER) - template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } - template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } - template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } + template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } + template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } + template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } #else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code @@ -229,7 +229,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_D // a correct instruction dependency. // TODO: do the same for MSVC (ICC is compatible) // NOTE: with the code below, MSVC's compiler crashes! 
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -237,7 +237,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) res = _mm_loadh_pd(res, (const double*)(from+2)) ; return _mm_castpd_ps(res); } -template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) +template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; @@ -245,7 +245,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) res = _mm_loadh_pd(res,from+1); return res; } -template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h index 1474bc1bb..ceaeda284 100644 --- a/Eigen/src/Core/products/CoeffBasedProduct.h +++ b/Eigen/src/Core/products/CoeffBasedProduct.h @@ -42,7 +42,7 @@ template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> struct ei_product_coeff_impl; -template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> +template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> struct ei_product_packet_impl; template<typename LhsNested, typename RhsNested, int NestingFlags> @@ -275,20 +275,20 @@ struct ei_product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar> *** Scalar path with inner vectorization *** *******************************************/ -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet> struct ei_product_coeff_vectorized_unroller { typedef typename Lhs::Index Index; 
enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) { - ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + ei_product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); pres = ei_padd(pres, ei_pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) )); } }; -template<typename Lhs, typename Rhs, typename PacketScalar> -struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> +template<typename Lhs, typename Rhs, typename Packet> +struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet> { typedef typename Lhs::Index Index; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) @@ -300,13 +300,13 @@ struct ei_product_coeff_vectorized_unroller<0, Lhs, Rhs, PacketScalar> template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> struct ei_product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar> { - typedef typename Lhs::PacketScalar PacketScalar; + typedef typename Lhs::PacketScalar Packet; typedef typename Lhs::Index Index; enum { PacketSize = ei_packet_traits<typename Lhs::Scalar>::size }; EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) { - PacketScalar pres; - ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, pres); + Packet pres; + ei_product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); ei_product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res); res = ei_predux(pres); } @@ -368,71 +368,71 @@ struct 
ei_product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetSca *** Packet path *** *******************/ -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(ei_pset1(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); + ei_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); } }; -template<int UnrollingIndex, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, PacketScalar, LoadMode> +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res); - res = ei_pmadd(lhs.template packet<LoadMode>(row, 
UnrollingIndex), ei_pset1(rhs.coeff(UnrollingIndex, col)), res); + ei_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = ei_pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), ei_pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + res = ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) { - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col))); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> 
+struct ei_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) { ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + res = ei_pmul(ei_pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); for(Index i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); + res = ei_pmadd(ei_pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); } }; -template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode> -struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMode> +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> { typedef typename Lhs::Index Index; - EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res) + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) { ei_assert(lhs.cols()>0 && "you are using a non initialized matrix"); - res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col))); + res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1<Packet>(rhs.coeff(0, col))); for(Index i = 1; i < lhs.cols(); ++i) - res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res); + res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1<Packet>(rhs.coeff(i, col)), res); } }; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3dae26eee..ffb4cd386 
100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -133,11 +133,13 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); } -#ifdef EIGEN_HAS_FUSE_CJMADD +// FIXME +// #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); -#endif +// #else + //#define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,ResPacket(T)); +// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); +// #endif // optimized GEneral packed Block * packed Panel product kernel template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> @@ -152,13 +154,13 @@ struct ei_gebp_kernel ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 }; - typedef typename ei_packet_traits<LhsScalar>::type _LhsPacketType; - typedef typename ei_packet_traits<RhsScalar>::type _RhsPacketType; - typedef typename ei_packet_traits<ResScalar>::type _ResPacketType; + typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; + typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; + typedef typename ei_packet_traits<ResScalar>::type _ResPacket; - typedef typename ei_meta_if<Vectorizable,_LhsPacketType,LhsScalar>::ret LhsPacketType; - typedef typename ei_meta_if<Vectorizable,_RhsPacketType,RhsScalar>::ret RhsPacketType; - typedef typename ei_meta_if<Vectorizable,_ResPacketType,ResScalar>::ret ResPacketType; + typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; + typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; + typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, Index 
strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) @@ -166,7 +168,7 @@ struct ei_gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<LhsPacketType,RhsPacketType,ConjugateLhs,ConjugateRhs> pcj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; Index packet_cols = (cols/nr) * nr; const Index peeled_mc = (rows/mr)*mr; const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsPacketSize ? LhsPacketSize : 0); @@ -183,7 +185,7 @@ struct ei_gebp_kernel const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; Index n = depth*nr; for(Index k=0; k<n; k++) - ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1(blB[k])); + ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k])); /*Scalar* dest = unpackedB; for(Index k=0; k<n; k+=4*PacketSize) { @@ -197,11 +199,11 @@ struct ei_gebp_kernel _mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0); #endif - PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize]; - C0[0] = ei_pload(blB+0*PacketSize); - C1[0] = ei_pload(blB+1*PacketSize); - C2[0] = ei_pload(blB+2*PacketSize); - C3[0] = ei_pload(blB+3*PacketSize); + RhsPacket C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize]; + C0[0] = ei_pload<RhsPacket>(blB+0*PacketSize); + C1[0] = ei_pload<RhsPacket>(blB+1*PacketSize); + C2[0] = ei_pload<RhsPacket>(blB+2*PacketSize); + C3[0] = ei_pload<RhsPacket>(blB+3*PacketSize); ei_punpackp(C0); ei_punpackp(C1); @@ -243,15 +245,15 @@ struct ei_gebp_kernel // TODO move the res loads to the stores // gets res block as register - ResPacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_pset1(ResScalar(0)); - C1 = ei_pset1(ResScalar(0)); - if(nr==4) C2 = ei_pset1(ResScalar(0)); - if(nr==4) C3 = ei_pset1(ResScalar(0)); - C4 = ei_pset1(ResScalar(0)); - C5 = ei_pset1(ResScalar(0)); - if(nr==4) C6 = ei_pset1(ResScalar(0)); - if(nr==4) C7 = 
ei_pset1(ResScalar(0)); + ResPacket C0, C1, C2, C3, C4, C5, C6, C7; + C0 = ei_pset1<ResPacket>(ResScalar(0)); + C1 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C2 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C3 = ei_pset1<ResPacket>(ResScalar(0)); + C4 = ei_pset1<ResPacket>(ResScalar(0)); + C5 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C6 = ei_pset1<ResPacket>(ResScalar(0)); + if(nr==4) C7 = ei_pset1<ResPacket>(ResScalar(0)); ResScalar* r0 = &res[(j2+0)*resStride + i]; ResScalar* r1 = r0 + resStride; @@ -271,106 +273,106 @@ struct ei_gebp_kernel { if(nr==2) { - LhsPacketType A0, A1; - RhsPacketType B0; + LhsPacket A0, A1; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif EIGEN_ASM_COMMENT("mybegin"); - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[1*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); - A0 = ei_pload(&blA[2*LhsPacketSize]); - A1 = ei_pload(&blA[3*LhsPacketSize]); - B0 = ei_pload(&blB[2*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[3*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); - A0 = ei_pload(&blA[4*LhsPacketSize]); - A1 = ei_pload(&blA[5*LhsPacketSize]); - B0 = ei_pload(&blB[4*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[4*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[5*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 
= ei_pload(&blB[5*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); - A0 = ei_pload(&blA[6*LhsPacketSize]); - A1 = ei_pload(&blA[7*LhsPacketSize]); - B0 = ei_pload(&blB[6*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[6*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[7*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[7*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); EIGEN_ASM_COMMENT("myend"); } else { - LhsPacketType A0, A1; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - B2 = ei_pload(&blB[2*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A1,B0,C4,B0); - B3 = ei_pload(&blB[3*RhsPacketSize]); - B0 = ei_pload(&blB[4*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[5*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B2,C2,T0); MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[6*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[2*LhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[3*LhsPacketSize]); - B3 = ei_pload(&blB[7*RhsPacketSize]); + A1 = 
ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[8*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[8*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[9*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[9*RhsPacketSize]); MADD(pcj,A0,B2,C2,T0); MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[10*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[10*RhsPacketSize]); MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[4*LhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[4*LhsPacketSize]); MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[5*LhsPacketSize]); - B3 = ei_pload(&blB[11*RhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[5*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[11*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[12*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[12*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); - B1 = ei_pload(&blB[13*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[13*RhsPacketSize]); MADD(pcj,A0,B2,C2,T0); MADD(pcj,A1,B2,C6,B2); - B2 = ei_pload(&blB[14*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[14*RhsPacketSize]); MADD(pcj,A0,B3,C3,T0); - A0 = ei_pload(&blA[6*LhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[6*LhsPacketSize]); MADD(pcj,A1,B3,C7,B3); - A1 = ei_pload(&blA[7*LhsPacketSize]); - B3 = ei_pload(&blB[15*RhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[7*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[15*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); MADD(pcj,A0,B1,C1,T0); @@ -389,38 +391,38 @@ EIGEN_ASM_COMMENT("myend"); { if(nr==2) { - LhsPacketType A0, A1; - RhsPacketType B0; + LhsPacket A0, A1; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = 
ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,B0); - B0 = ei_pload(&blB[1*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); MADD(pcj,A1,B0,C5,B0); } else { - LhsPacketType A0, A1; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - B2 = ei_pload(&blB[2*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A1,B0,C4,B0); - B3 = ei_pload(&blB[3*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B1,C1,T0); MADD(pcj,A1,B1,C5,B1); MADD(pcj,A0,B2,C2,T0); @@ -433,16 +435,16 @@ EIGEN_ASM_COMMENT("myend"); blA += mr; } - ResPacketType R0, R1, R2, R3, R4, R5, R6, R7; + ResPacket R0, R1, R2, R3, R4, R5, R6, R7; - R0 = ei_ploadu(r0); - R1 = ei_ploadu(r1); - if(nr==4) R2 = ei_ploadu(r2); - if(nr==4) R3 = ei_ploadu(r3); - R4 = ei_ploadu(r0 + ResPacketSize); - R5 = ei_ploadu(r1 + ResPacketSize); - if(nr==4) R6 = ei_ploadu(r2 + ResPacketSize); - if(nr==4) R7 = ei_ploadu(r3 + ResPacketSize); + R0 = ei_ploadu<ResPacket>(r0); + R1 = ei_ploadu<ResPacket>(r1); + if(nr==4) R2 = ei_ploadu<ResPacket>(r2); + if(nr==4) R3 = ei_ploadu<ResPacket>(r3); + R4 = ei_ploadu<ResPacket>(r0 + ResPacketSize); + R5 = ei_ploadu<ResPacket>(r1 + ResPacketSize); + if(nr==4) R6 = ei_ploadu<ResPacket>(r2 + ResPacketSize); + if(nr==4) R7 = ei_ploadu<ResPacket>(r3 + ResPacketSize); C0 = ei_padd(R0, 
C0); C1 = ei_padd(R1, C1); @@ -469,11 +471,11 @@ EIGEN_ASM_COMMENT("myend"); ei_prefetch(&blA[0]); // gets res block as register - ResPacketType C0, C1, C2, C3; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); + ResPacket C0, C1, C2, C3; + C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + C1 = ei_ploadu<ResPacket>(&res[(j2+1)*resStride + i]); + if(nr==4) C2 = ei_ploadu<ResPacket>(&res[(j2+2)*resStride + i]); + if(nr==4) C3 = ei_ploadu<ResPacket>(&res[(j2+3)*resStride + i]); // performs "inner" product const RhsScalar* blB = unpackedB; @@ -481,70 +483,70 @@ EIGEN_ASM_COMMENT("myend"); { if(nr==2) { - LhsPacketType A0; - RhsPacketType B0, B1; + LhsPacket A0; + RhsPacket B0, B1; - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[2*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[1*LhsPacketSize]); - B1 = ei_pload(&blB[3*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[4*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[2*LhsPacketSize]); - B1 = ei_pload(&blB[5*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[6*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - A0 = ei_pload(&blA[3*LhsPacketSize]); - B1 = ei_pload(&blB[7*RhsPacketSize]); + A0 = 
ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); MADD(pcj,A0,B1,C1,B1); } else { - LhsPacketType A0; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0; + RhsPacket B0, B1, B2, B3; - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B2 = ei_pload(&blB[2*RhsPacketSize]); - B3 = ei_pload(&blB[3*RhsPacketSize]); - B0 = ei_pload(&blB[4*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[4*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[5*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[5*RhsPacketSize]); MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[6*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[6*RhsPacketSize]); MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[1*LhsPacketSize]); - B3 = ei_pload(&blB[7*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[7*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[8*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[8*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[9*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[9*RhsPacketSize]); MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[10*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[10*RhsPacketSize]); MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[2*LhsPacketSize]); - B3 = ei_pload(&blB[11*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[2*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[11*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); - B0 = ei_pload(&blB[12*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[12*RhsPacketSize]); MADD(pcj,A0,B1,C1,B1); - B1 = ei_pload(&blB[13*RhsPacketSize]); + B1 = 
ei_pload<RhsPacket>(&blB[13*RhsPacketSize]); MADD(pcj,A0,B2,C2,B2); - B2 = ei_pload(&blB[14*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[14*RhsPacketSize]); MADD(pcj,A0,B3,C3,B3); - A0 = ei_pload(&blA[3*LhsPacketSize]); - B3 = ei_pload(&blB[15*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[3*LhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[15*RhsPacketSize]); MADD(pcj,A0,B0,C0,B0); MADD(pcj,A0,B1,C1,B1); MADD(pcj,A0,B2,C2,B2); @@ -559,31 +561,31 @@ EIGEN_ASM_COMMENT("myend"); { if(nr==2) { - LhsPacketType A0; - RhsPacketType B0; + LhsPacket A0; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0; + RhsPacket T0; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - B0 = ei_pload(&blB[1*RhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); MADD(pcj,A0,B0,C1,T0); } else { - LhsPacketType A0; - RhsPacketType B0, B1, B2, B3; + LhsPacket A0; + RhsPacket B0, B1, B2, B3; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0, T1; + RhsPacket T0, T1; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); - B1 = ei_pload(&blB[1*RhsPacketSize]); - B2 = ei_pload(&blB[2*RhsPacketSize]); - B3 = ei_pload(&blB[3*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); + B1 = ei_pload<RhsPacket>(&blB[1*RhsPacketSize]); + B2 = ei_pload<RhsPacket>(&blB[2*RhsPacketSize]); + B3 = ei_pload<RhsPacket>(&blB[3*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A0,B1,C1,T1); @@ -662,7 +664,7 @@ EIGEN_ASM_COMMENT("myend"); { const RhsScalar* blB = &blockB[j2*strideB+offsetB]; for(Index k=0; k<depth; k++) - ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1(blB[k])); + ei_pstore(&unpackedB[k*RhsPacketSize], ei_pset1<RhsPacket>(blB[k])); } for(Index i=0; i<peeled_mc; i+=mr) @@ -673,22 +675,22 @@ 
EIGEN_ASM_COMMENT("myend"); // TODO move the res loads to the stores // get res block as registers - ResPacketType C0, C4; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + ResPacketSize]); + ResPacket C0, C4; + C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + C4 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i + ResPacketSize]); const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - LhsPacketType A0, A1; - RhsPacketType B0; + LhsPacket A0, A1; + RhsPacket B0; #ifndef EIGEN_HAS_FUSE_CJMADD - ResPacketType T0, T1; + RhsPacket T0, T1; #endif - A0 = ei_pload(&blA[0*LhsPacketSize]); - A1 = ei_pload(&blA[1*LhsPacketSize]); - B0 = ei_pload(&blB[0*RhsPacketSize]); + A0 = ei_pload<LhsPacket>(&blA[0*LhsPacketSize]); + A1 = ei_pload<LhsPacket>(&blA[1*LhsPacketSize]); + B0 = ei_pload<RhsPacket>(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); MADD(pcj,A1,B0,C4,T1); @@ -705,13 +707,13 @@ EIGEN_ASM_COMMENT("myend"); const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsPacketSize]; ei_prefetch(&blA[0]); - ResPacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + ResPacket C0 = ei_ploadu<ResPacket>(&res[(j2+0)*resStride + i]); const RhsScalar* blB = unpackedB; for(Index k=0; k<depth; k++) { - ResPacketType T0; - MADD(pcj,ei_pload(blA), ei_pload(blB), C0, T0); + RhsPacket T0; + MADD(pcj,ei_pload<LhsPacket>(blA), ei_pload<RhsPacket>(blB), C0, T0); blB += RhsPacketSize; blA += LhsPacketSize; } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 0997746ef..e0d71be7e 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -32,54 +32,71 @@ * same alignment pattern. 
* TODO: since rhs gets evaluated only once, no need to evaluate it */ -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType> -static EIGEN_DONT_INLINE -void ei_cache_friendly_product_colmajor_times_vector( - Index size, - const Scalar* lhs, Index lhsStride, - const RhsType& rhs, - Scalar* res, - Scalar alpha) +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs> { +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && ei_packet_traits<LhsScalar>::size==ei_packet_traits<RhsScalar>::size, + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? 
ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +template<typename RhsType> +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsType&/*const RhsScalar**/ rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + EIGEN_UNUSED_VARIABLE(rhsIncr); + ei_internal_assert(resIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ ei_pstore(&res[j], \ - ei_padd(ei_pload(&res[j]), \ + ei_padd(ei_pload<ResPacket>(&res[j]), \ ei_padd( \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \ - ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) ))) - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> 
cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; if(ConjugateRhs) alpha = ei_conj(alpha); enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; const Index columnsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; + const Index PeelAlignedMask = ResPacketSize*peels-1; + const Index size = rows; // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. Index alignedStart = ei_first_aligned(res,size); - Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -88,19 +105,19 @@ void ei_cache_friendly_product_colmajor_times_vector( // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(res)%sizeof(Scalar)) ) + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); - while (skipColumns<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize)) + while (skipColumns<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) ++skipColumns; - if (skipColumns==PacketSize) + if (skipColumns==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -108,14 +125,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } else { - skipColumns = std::min(skipColumns,rhs.size()); + skipColumns = std::min(skipColumns,cols); // note that the skiped columns are processed later. 
} ei_internal_assert( (alignmentPattern==NoneAligned) - || (skipColumns + columnsAtOnce >= rhs.size()) - || PacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0); + || (skipColumns + columnsAtOnce >= cols) + || LhsPacketSize > size + || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -127,15 +144,15 @@ void ei_cache_friendly_product_colmajor_times_vector( Index offset1 = (FirstAligned && alignmentStep==1?3:1); Index offset3 = (FirstAligned && alignmentStep==1?1:3); - Index columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; + Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]), ptmp1 = ei_pset1(alpha*rhs[i+offset1]), - ptmp2 = ei_pset1(alpha*rhs[i+2]), ptmp3 = ei_pset1(alpha*rhs[i+offset3]); + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i]), ptmp1 = ei_pset1<RhsPacket>(alpha*rhs[i+offset1]), + ptmp2 = ei_pset1<RhsPacket>(alpha*rhs[i+2]), ptmp3 = ei_pset1<RhsPacket>(alpha*rhs[i+offset3]); // this helps a lot generating better binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { @@ -154,51 +171,51 @@ void ei_cache_friendly_product_colmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); 
break; case FirstAligned: if(peels>1) { - Packet A00, A01, A02, A03, A10, A11, A12, A13; + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize) { - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); - A00 = ei_pload (&lhs0[j]); - A10 = ei_pload (&lhs0[j+PacketSize]); - A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j])); - A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); + A00 = ei_pload<LhsPacket>(&lhs0[j]); + A10 = ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + A00 = pcj.pmadd(A00, ptmp0, ei_pload<ResPacket>(&res[j])); + A10 = pcj.pmadd(A10, ptmp0, ei_pload<ResPacket>(&res[j+ResPacketSize])); A00 = pcj.pmadd(A01, ptmp1, A00); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); + A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); A00 = pcj.pmadd(A02, ptmp2, A00); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); A00 = pcj.pmadd(A03, ptmp3, A00); ei_pstore(&res[j],A00); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); A10 = pcj.pmadd(A11, ptmp1, A10); A10 = 
pcj.pmadd(A12, ptmp2, A10); A10 = pcj.pmadd(A13, ptmp3, A10); - ei_pstore(&res[j+PacketSize],A10); + ei_pstore(&res[j+ResPacketSize],A10); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -216,14 +233,14 @@ void ei_cache_friendly_product_colmajor_times_vector( } // process remaining first and last columns (at most columnsAtOnce-1) - Index end = rhs.size(); + Index end = cols; Index start = columnBound; do { for (Index i=start; i<end; ++i) { - Packet ptmp0 = ei_pset1(alpha*rhs[i]); - const Scalar* lhs0 = lhs + i*lhsStride; + RhsPacket ptmp0 = ei_pset1<RhsPacket>(alpha*rhs[i]); + const LhsScalar* lhs0 = lhs + i*lhsStride; if (Vectorizable) { @@ -233,12 +250,12 @@ void ei_cache_friendly_product_colmajor_times_vector( res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0)); // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_pload(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=ResPacketSize) + ei_pstore(&res[j], pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ptmp0, ei_pload<ResPacket>(&res[j]))); else - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ei_pstore(&res[j], pcj.pmadd(ei_ploadu(&lhs0[j]), ptmp0, ei_pload(&res[j]))); + for (Index j = alignedStart;j<alignedSize;j+=ResPacketSize) + ei_pstore(&res[j], pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ptmp0, ei_pload<ResPacket>(&res[j]))); } // process remaining scalars (or all if no explicit vectorization) @@ -256,15 +273,35 @@ void ei_cache_friendly_product_colmajor_times_vector( } while(Vectorizable); #undef 
_EIGEN_ACCUMULATE_PACKETS } +}; -// TODO add peeling to mask unaligned load/stores -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index> -static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs> +{ +typedef typename ei_scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable + && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size), + LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1 +}; + +typedef typename ei_packet_traits<LhsScalar>::type _LhsPacket; +typedef typename ei_packet_traits<RhsScalar>::type _RhsPacket; +typedef typename ei_packet_traits<ResScalar>::type _ResPacket; + +typedef typename ei_meta_if<Vectorizable,_LhsPacket,LhsScalar>::ret LhsPacket; +typedef typename ei_meta_if<Vectorizable,_RhsPacket,RhsScalar>::ret RhsPacket; +typedef typename ei_meta_if<Vectorizable,_ResPacket,ResScalar>::ret ResPacket; + +EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const Scalar* lhs, Index lhsStride, - const Scalar* rhs, Index rhsIncr, - Scalar* res, Index resIncr, - Scalar alpha) + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) { ei_internal_assert(rhsIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS @@ -272,39 +309,33 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #endif #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - Packet b = ei_pload(&rhs[j]); \ - ptmp0 = 
pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); } - - typedef typename NumTraits<Scalar>::Real RealScalar; - typedef typename ei_packet_traits<Scalar>::type Packet; - enum { - PacketSize = sizeof(Packet)/sizeof(Scalar), - Vectorizable = ei_packet_traits<Scalar>::Vectorizable - }; - - ei_conj_helper<Scalar,Scalar,ConjugateLhs,ConjugateRhs> cj; - ei_conj_helper<Packet,Packet,ConjugateLhs,ConjugateRhs> pcj; + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); \ + ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ + ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + + ei_conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + ei_conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; - const Index PacketAlignedMask = PacketSize-1; - const Index PeelAlignedMask = PacketSize*peels-1; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. Index alignedStart = ei_first_aligned(rhs, depth); - Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0; + Index alignedSize = RhsPacketSize>1 ? 
alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; - const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(PacketSize/2) ? EvenAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices @@ -313,19 +344,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(Scalar)) || (size_t(rhs)%sizeof(Scalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) { alignedSize = 0; alignedStart = 0; } - else if (PacketSize>1) + else if (LhsPacketSize>1) { - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || depth<PacketSize); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); - while (skipRows<PacketSize && - alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize)) + while (skipRows<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) ++skipRows; - if (skipRows==PacketSize) + if (skipRows==LhsPacketSize) { // nothing can be aligned, no need to skip any column alignmentPattern = NoneAligned; @@ -337,10 +368,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // note that the skiped columns are processed later. 
} ei_internal_assert( alignmentPattern==NoneAligned - || PacketSize==1 + || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) - || PacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0); + || LhsPacketSize > depth + || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); } else if(Vectorizable) { @@ -355,23 +386,24 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Scalar tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0); + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code - const Scalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; if (Vectorizable) { /* explicit vectorization */ - Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0)); + ResPacket ptmp0 = ei_pset1<ResPacket>(ResScalar(0)), ptmp1 = ei_pset1<ResPacket>(ResScalar(0)), + ptmp2 = ei_pset1<ResPacket>(ResScalar(0)), ptmp3 = ei_pset1<ResPacket>(ResScalar(0)); // process initial unaligned coeffs // FIXME this loop get vectorized by the compiler ! 
for (Index j=0; j<alignedStart; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -381,11 +413,11 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( switch(alignmentPattern) { case AllAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,d,d); break; case EvenAligned: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,d); break; case FirstAligned: @@ -397,38 +429,38 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( * overlaping the desired unaligned packet. This is *much* more efficient * than basic unaligned loads. */ - Packet A01, A02, A03, b, A11, A12, A13; - A01 = ei_pload(&lhs1[alignedStart-1]); - A02 = ei_pload(&lhs2[alignedStart-2]); - A03 = ei_pload(&lhs3[alignedStart-3]); + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = ei_pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = ei_pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = ei_pload<LhsPacket>(&lhs3[alignedStart-3]); - for (Index j = alignedStart; j<peeledSize; j+=peels*PacketSize) + for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize) { - b = ei_pload(&rhs[j]); - A11 = ei_pload(&lhs1[j-1+PacketSize]); ei_palign<1>(A01,A11); - A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); - A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); + RhsPacket b = ei_pload<RhsPacket>(&rhs[j]); + A11 = ei_pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); ei_palign<1>(A01,A11); + A12 = ei_pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); ei_palign<2>(A02,A12); + A13 = ei_pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); ei_palign<3>(A03,A13); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); + ptmp0 = 
pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); + A01 = ei_pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); ei_palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); + A02 = ei_pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); ei_palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); + A03 = ei_pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); ei_palign<3>(A13,A03); - b = ei_pload(&rhs[j+PacketSize]); - ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); + b = ei_pload<RhsPacket>(&rhs[j+RhsPacketSize]); + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } - for (Index j = peeledSize; j<alignedSize; j+=PacketSize) + for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(d,du,du); break; default: - for (Index j = alignedStart; j<alignedSize; j+=PacketSize) + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) _EIGEN_ACCUMULATE_PACKETS(du,du,du); break; } @@ -443,7 +475,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( // FIXME this loop get vectorized by the compiler ! 
for (Index j=alignedSize; j<depth; ++j) { - Scalar b = rhs[j]; + RhsScalar b = rhs[j]; tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); } @@ -460,9 +492,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( { for (Index i=start; i<end; ++i) { - EIGEN_ALIGN16 Scalar tmp0 = Scalar(0); - Packet ptmp0 = ei_pset1(tmp0); - const Scalar* lhs0 = lhs + i*lhsStride; + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResPacket ptmp0 = ei_pset1<ResPacket>(tmp0); + const LhsScalar* lhs0 = lhs + i*lhsStride; // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) @@ -471,12 +503,12 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_pload(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_pload<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); else - for (Index j = alignedStart;j<alignedSize;j+=PacketSize) - ptmp0 = pcj.pmadd(ei_ploadu(&lhs0[j]), ei_pload(&rhs[j]), ptmp0); + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ei_ploadu<LhsPacket>(&lhs0[j]), ei_pload<RhsPacket>(&rhs[j]), ptmp0); tmp0 += ei_predux(ptmp0); } @@ -498,5 +530,6 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( #undef _EIGEN_ACCUMULATE_PACKETS } +}; #endif // EIGEN_GENERAL_MATRIX_VECTOR_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 4514c7692..8a10075f0 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ 
b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -77,14 +77,14 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; Scalar t0 = cjAlpha * rhs[j]; - Packet ptmp0 = ei_pset1(t0); + Packet ptmp0 = ei_pset1<Packet>(t0); Scalar t1 = cjAlpha * rhs[j+1]; - Packet ptmp1 = ei_pset1(t1); + Packet ptmp1 = ei_pset1<Packet>(t1); Scalar t2 = 0; - Packet ptmp2 = ei_pset1(t2); + Packet ptmp2 = ei_pset1<Packet>(t2); Scalar t3 = 0; - Packet ptmp3 = ei_pset1(t3); + Packet ptmp3 = ei_pset1<Packet>(t3); size_t starti = FirstTriangular ? 0 : j+2; size_t endi = FirstTriangular ? j : size; @@ -119,10 +119,10 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( Scalar* EIGEN_RESTRICT resIt = res + alignedStart; for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) { - Packet A0i = ei_ploadu(a0It); a0It += PacketSize; - Packet A1i = ei_ploadu(a1It); a1It += PacketSize; - Packet Bi = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases - Packet Xi = ei_pload (resIt); + Packet A0i = ei_ploadu<Packet>(a0It); a0It += PacketSize; + Packet A1i = ei_ploadu<Packet>(a1It); a1It += PacketSize; + Packet Bi = ei_ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases + Packet Xi = ei_pload <Packet>(resIt); Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index de4f0b7bb..16b02a425 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -76,12 +76,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co if (r>0) { Index s = IsLower ? 
pi+actualPanelWidth : 0; - ei_cache_friendly_product_colmajor_times_vector<ConjLhs,ConjRhs>( - r, + ei_general_matrix_vector_product<Index,Scalar,ColMajor,ConjLhs,Scalar,ConjRhs>::run( + r, actualPanelWidth, &(lhs.const_cast_derived().coeffRef(s,pi)), lhs.outerStride(), - rhs.segment(pi, actualPanelWidth), - &(res.coeffRef(s)), - alpha); + rhs.segment(pi, actualPanelWidth), rhs.innerStride(), + &res.coeffRef(s), res.innerStride(), alpha); } } } @@ -119,7 +118,7 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co if (r>0) { Index s = IsLower ? 0 : pi + actualPanelWidth; - ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>( + ei_general_matrix_vector_product<Index,Scalar,RowMajor,ConjLhs,Scalar,ConjRhs>::run( actualPanelWidth, r, &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(), &(rhs.const_cast_derived().coeffRef(s)), 1, diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 62a6207c6..459446409 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -45,14 +45,21 @@ template< int ResStorageOrder> struct ei_general_matrix_matrix_product; -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename RhsType> -static void ei_cache_friendly_product_colmajor_times_vector( - Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha); +template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct ei_general_matrix_vector_product; -template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index> -static void ei_cache_friendly_product_rowmajor_times_vector( - Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr, - Scalar* res, Index resIncr, Scalar alpha); + +template<bool Conjugate> struct ei_conj_if; + +template<> struct ei_conj_if<true> { + template<typename 
T> + inline T operator()(const T& x) { return ei_conj(x); } +}; + +template<> struct ei_conj_if<false> { + template<typename T> + inline const T& operator()(const T& x) { return x; } +}; template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false> { @@ -90,35 +97,24 @@ template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, st { return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } }; -template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, RealScalar, false,false> +template<typename RealScalar,bool Conj> struct ei_conj_helper<std::complex<RealScalar>, RealScalar, Conj,false> { typedef std::complex<RealScalar> Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const { return ei_padd(c, ei_pmul(x,y)); } - + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const + { return ei_padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return ei_pmul(x,y); } + { return ei_conj_if<Conj>()(x)*y; } }; -template<typename RealScalar> struct ei_conj_helper<RealScalar, std::complex<RealScalar>, false,false> +template<typename RealScalar,bool Conj> struct ei_conj_helper<RealScalar, std::complex<RealScalar>, false,Conj> { typedef std::complex<RealScalar> Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const { return ei_padd(c, pmul(x,y)); } - + EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const + { return ei_padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x * y; } -}; - -template<bool Conjugate> struct ei_conj_if; - -template<> struct ei_conj_if<true> { - template<typename T> - inline T operator()(const T& x) { return ei_conj(x); } + { return x*ei_conj_if<Conj>()(y); } }; -template<> 
struct ei_conj_if<false> { - template<typename T> - inline const T& operator()(const T& x) { return x; } -}; // Lightweight helper class to access matrix coefficients. // Yes, this is somehow redundant with Map<>, but this version is much much lighter, |