13 files changed, 553 insertions, 59 deletions
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index ca30c4dce..139132c6b 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -324,6 +324,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
     typedef typename ProductType::ActualRhsType ActualRhsType;
     typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
     typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest;
 
     ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
     ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
@@ -342,7 +343,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
     else
     {
       actualDest = ei_aligned_stack_new(Scalar,dest.size());
-      Map<typename Dest::PlainObject>(actualDest, dest.size()) = dest;
+      MappedDest(actualDest, dest.size()) = dest;
     }
 
     ei_cache_friendly_product_colmajor_times_vector
@@ -353,7 +354,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
 
     if (!EvalToDest)
     {
-      dest = Map<typename Dest::PlainObject>(actualDest, dest.size());
+      dest = MappedDest(actualDest, dest.size());
       ei_aligned_stack_delete(Scalar, actualDest, dest.size());
     }
   }
@@ -365,6 +366,7 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
   static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha)
   {
     typedef typename ProductType::Scalar Scalar;
+    typedef typename ProductType::Index Index;
     typedef typename ProductType::ActualLhsType ActualLhsType;
     typedef typename ProductType::ActualRhsType ActualRhsType;
     typedef typename ProductType::_ActualRhsType _ActualRhsType;
@@ -394,9 +396,12 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
     }
 
     ei_cache_friendly_product_rowmajor_times_vector
-      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>(
+      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>(
+        actualLhs.rows(), actualLhs.cols(),
         &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
-        rhs_data, prod.rhs().size(), dest, actualAlpha);
+        rhs_data, 1,
+        &dest.coeffRef(0,0), dest.innerStride(),
+        actualAlpha);
 
     if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size());
   }
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 310e262d8..90ce2a802 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -80,12 +80,13 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor
         // 2 - it is slighlty faster at runtime
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
-        VectorBlock<Rhs,Dynamic> target(other,startRow,actualPanelWidth);
 
-        ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false>(
+        ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>(
+          actualPanelWidth, r,
           &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(),
-          &(other.coeffRef(startCol)), r,
-          target, Scalar(-1));
+          &(other.coeffRef(startCol)), other.innerStride(),
+          &other.coeffRef(startRow), other.innerStride(),
+          Scalar(-1));
       }
 
       for(Index k=0; k<actualPanelWidth; ++k)
diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index c66bbb9fa..e2afa23b0 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -90,7 +90,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
 
   protected:
 
-    void check_coordinates(Index row, Index col)
+    void check_coordinates(Index row, Index col) const
     {
       EIGEN_ONLY_USED_FOR_DEBUG(row);
       EIGEN_ONLY_USED_FOR_DEBUG(col);
@@ -102,12 +102,12 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     }
 
     #ifdef EIGEN_INTERNAL_DEBUGGING
-    void check_coordinates_internal(Index row, Index col)
+    void check_coordinates_internal(Index row, Index col) const
     {
       check_coordinates(row, col);
     }
     #else
-    void check_coordinates_internal(Index , Index ) {}
+    void check_coordinates_internal(Index , Index ) const {}
     #endif
 
 };
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
new file mode 100644
index 000000000..2dba95a2f
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -0,0 +1,215 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_COMPLEX_ALTIVEC_H
+#define EIGEN_COMPLEX_ALTIVEC_H
+
+static Packet4ui  ei_p4ui_CONJ_XOR = vec_mergeh((Packet4ui)ei_p4i_ZERO, (Packet4ui)ei_p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static Packet16uc ei_p16uc_COMPLEX_RE   = vec_sld((Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc ei_p16uc_COMPLEX_IM   = vec_sld((Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc ei_p16uc_COMPLEX_REV  = vec_sld(ei_p16uc_REVERSE, ei_p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc ei_p16uc_COMPLEX_REV2 = vec_sld(ei_p16uc_FORWARD, ei_p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc ei_p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc ei_p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f  v;
+};
+
+template<> struct ei_packet_traits<std::complex<float> >  : ei_default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
+  if ((ptrdiff_t)&from % 16 == 0) {
+    res.v = ei_pload((const float *)&from);
+    res.v = vec_perm(res.v, res.v, ei_p16uc_PSET_HI);
+  } else {
+    res.v = ei_ploadu((const float *)&from);
+    res.v = vec_perm(res.v, res.v, ei_p16uc_PSET_LO);
+  }
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a) { return Packet2cf(ei_psub<Packet4f>(ei_p4f_ZERO, a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, ei_p4ui_CONJ_XOR)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  v1 = vec_perm(a.v, a.v, ei_p16uc_COMPLEX_RE);
+  // Get the imaginary parts of a
+  v2 = vec_perm(a.v, a.v, ei_p16uc_COMPLEX_IM);
+  // multiply a_re * b 
+  v1 = vec_madd(v1, b.v, ei_p4f_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(v2, b.v, ei_p4f_ZERO);
+  v2 = (Packet4f) vec_xor((Packet4ui)v2, ei_p4ui_CONJ_XOR);
+  // permute back to a proper order
+  v2 = vec_perm(v2, v2, ei_p16uc_COMPLEX_REV);
+  
+  return Packet2cf(vec_add(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  ei_pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  ei_pstore((float *)&res, a.v);
+
+  return res[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a)
+{
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, ei_p16uc_COMPLEX_REV2);
+  return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  b = ei_padd(a.v, b);
+  return ei_pfirst(Packet2cf(sum));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  Packet4f b1, b2;
+  
+  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = (Packet4f) vec_sld(b2, b2, 8);
+  b2 = ei_padd(b1, b2);
+
+  return Packet2cf(b2);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf prod;
+  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  prod = ei_pmul(a, Packet2cf(b));
+
+  return ei_pfirst(prod);
+}
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vec_sld(first.v, second.v, 8);
+    }
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(a, ei_pconj(b));
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(ei_pconj(a), b);
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pconj(ei_pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = ei_conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet4f s = vec_madd(b.v, b.v, ei_p4f_ZERO);
+  return Packet2cf(ei_pdiv(res.v, vec_add(s,vec_perm(s, s, ei_p16uc_COMPLEX_REV))));
+}
+
+#endif // EIGEN_COMPLEX_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index f58da60e6..a3ceed8e8 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -74,6 +74,7 @@ typedef __vector unsigned char  Packet16uc;
 static Packet4f ei_p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
 static Packet4i ei_p4i_COUNTDOWN = { 3, 2, 1, 0 };
 static Packet16uc ei_p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+static Packet16uc ei_p16uc_FORWARD = vec_lvsl(0, (float*)0);
 
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
new file mode 100644
index 000000000..bf68a2bbb
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -0,0 +1,257 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_COMPLEX_ALTIVEC_H
+#define EIGEN_COMPLEX_ALTIVEC_H
+
+static uint32x4_t ei_p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static uint32x2_t ei_p2ui_CONJ_XOR = { 0x00000000, 0x80000000 };
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f  v;
+};
+
+template<> struct ei_packet_traits<std::complex<float> >  : ei_default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+{
+  float32x2_t r64;
+  r64 = vld1_f32((float *)&from);
+
+  return Packet2cf(vcombine_f32(r64, r64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(ei_padd<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(ei_psub<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a) { return Packet2cf(ei_pnegate<Packet4f>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pconj(const Packet2cf& a)
+{
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), ei_p4ui_CONJ_XOR)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+  float32x2_t a_lo, a_hi;
+
+  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
+  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
+  // Multiply the real a with b
+  v1 = vmulq_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f32(v2, b.v);
+  // Conjugate v2 
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), ei_p4ui_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  a_lo = vrev64_f32(vget_low_f32(v2));
+  a_hi = vrev64_f32(vget_high_f32(v2));
+  v2 = vcombine_f32(a_lo, a_hi);
+  // Add and return the result
+  return Packet2cf(vaddq_f32(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf ei_por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> *   addr) { __pld((float *)addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  ei_pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 x[2];
+  vst1q_f32((float *)x, a.v);
+  return x[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a)
+{
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r128;
+
+  a_lo = vget_low_f32(a.v);
+  a_hi = vget_high_f32(a.v);
+  a_r128 = vcombine_f32(a_hi, a_lo);
+
+  return Packet2cf(a_r128);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2;
+  std::complex<float> s;
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+  a2 = vadd_f32(a1, a2);
+  vst1_f32((float *)&s, a2);
+
+  return s;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  Packet4f sum1, sum2, sum;
+
+  // Add the first two 64-bit float32x2_t of vecs[0]
+  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
+  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
+  sum = vaddq_f32(sum1, sum2);
+
+  return Packet2cf(sum);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2, v1, v2, prod;
+  std::complex<float> s;
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vdup_lane_f32(a1, 0);
+  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vdup_lane_f32(a1, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, a2);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, a2);
+  // Conjugate v2 
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), ei_p2ui_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add v1, v2
+  prod = vadd_f32(v1, v2);
+
+  vst1_f32((float *)&s, prod);
+
+  return s;
+}
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vextq_f32(first.v, second.v, 2);
+    }
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(a, ei_pconj(b));
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(ei_pconj(a), b);
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pconj(ei_pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = ei_conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet4f s, rev_s;
+  float32x2_t a_lo, a_hi;
+
+  // this computes the norm
+  s = vmulq_f32(b.v, b.v);
+  a_lo = vrev64_f32(vget_low_f32(s));
+  a_hi = vrev64_f32(vget_high_f32(s));
+  rev_s = vcombine_f32(a_lo, a_hi);
+
+  return Packet2cf(ei_pdiv(res.v, vaddq_f32(s,rev_s)));
+}
+
+#endif // EIGEN_COMPLEX_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 97750adbe..5b0d6ab12 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -64,7 +64,8 @@ template<> struct ei_packet_traits<float>  : ei_default_packet_traits
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-
+   
+    HasDiv  = 1,
     // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
@@ -176,8 +177,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a,
 template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)   { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
 
 template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
 template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index d61e82d00..9d32ede0e 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -275,21 +275,26 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_por    <Packet1cd>(const Packet1cd&
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from)); }
+// FIXME force unaligned load, this is a temporary fix 
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); }
 
-template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v); }
+// FIXME force unaligned store, this is a temporary fix
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); }
 
 template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  ei_pfirst<Packet1cd>(const Packet1cd& a)
 {
-  EIGEN_ALIGN16 double res[2];
-  _mm_store_pd(res, a.v);
-  return *(std::complex<double>*)res;
+//   EIGEN_ALIGN16 double res[2];
+//   _mm_store_pd(res, a.v);
+//   return *(std::complex<double>*)res;
+  EIGEN_ALIGN16 std::complex<double> res;
+  ei_pstore(&res, a);
+  return res;
 }
 
 template<> EIGEN_STRONG_INLINE Packet1cd ei_preverse(const Packet1cd& a) { return a; }
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index bfbc74370..91af346ed 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -112,8 +112,12 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) {
   return ei_vec4f_swizzle1(res,0,0,0,0);
 }
 template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double&  from) {
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_loaddup_pd(&from);
+#else
   Packet2d res = _mm_set_sd(from);
   return ei_vec2d_swizzle1(res, 0, 0);
+#endif
 }
 #else
 template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) { return _mm_set1_ps(from); }
@@ -267,13 +271,13 @@ template<> EIGEN_STRONG_INLINE void ei_prefetch<float>(const float*   addr) { _m
 template<> EIGEN_STRONG_INLINE void ei_prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void ei_prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
-#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
+#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) && !defined(__INTEL_COMPILER)
 // The temporary variable fixes an internal compilation error.
 // Direct of the struct members fixed bug #62.
 template<> EIGEN_STRONG_INLINE float  ei_pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
 template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
 template<> EIGEN_STRONG_INLINE int    ei_pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif defined(_MSC_VER) && (_MSC_VER <= 1500)
+#elif defined(_MSC_VER) && (_MSC_VER <= 1500) && !defined(__INTEL_COMPILER)
 // The temporary variable fixes an internal compilation error.
 template<> EIGEN_STRONG_INLINE float  ei_pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
 template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index e671a657e..0997746ef 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -258,13 +258,15 @@ void ei_cache_friendly_product_colmajor_times_vector(
 }
 
 // TODO add peeling to mask unaligned load/stores
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
+  Index rows, Index cols,
   const Scalar* lhs, Index lhsStride,
-  const Scalar* rhs, Index rhsSize,
-  ResType& res,
+  const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr,
   Scalar alpha)
 {
+  ei_internal_assert(rhsIncr==1);
   #ifdef _EIGEN_ACCUMULATE_PACKETS
   #error _EIGEN_ACCUMULATE_PACKETS has already been defined
   #endif
@@ -291,22 +293,22 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   const Index peels = 2;
   const Index PacketAlignedMask = PacketSize-1;
   const Index PeelAlignedMask = PacketSize*peels-1;
-  const Index size = rhsSize;
+  const Index depth = cols;
 
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type
   // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = ei_first_aligned(rhs, size);
-  Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  Index alignedStart = ei_first_aligned(rhs, depth);
+  Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0;
   const Index peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
 
   const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   Index alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==(PacketSize/2) ? EvenAligned
-                       : FirstAligned;
+                         : alignmentStep==(PacketSize/2) ? EvenAligned
+                         : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = ei_first_aligned(lhs,size);
+  const Index lhsAlignmentOffset = ei_first_aligned(lhs,depth);
 
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   Index skipRows = 0;
@@ -318,7 +320,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   }
   else if (PacketSize>1)
   {
-    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || size<PacketSize);
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || depth<PacketSize);
 
     while (skipRows<PacketSize &&
            alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
@@ -331,26 +333,26 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
     }
     else
     {
-      skipRows = std::min(skipRows,Index(res.size()));
+      skipRows = std::min(skipRows,Index(rows));
       // note that the skiped columns are processed later.
     }
     ei_internal_assert(  alignmentPattern==NoneAligned
                       || PacketSize==1
-                      || (skipRows + rowsAtOnce >= res.size())
-                      || PacketSize > rhsSize
+                      || (skipRows + rowsAtOnce >= rows)
+                      || PacketSize > depth
                       || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
   }
   else if(Vectorizable)
   {
     alignedStart = 0;
-    alignedSize = size;
+    alignedSize = depth;
     alignmentPattern = AllAligned;
   }
 
   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
 
-  Index rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
     EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
@@ -439,17 +441,20 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 
     // process remaining coeffs (or all if no explicit vectorization)
     // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<size; ++j)
+    for (Index j=alignedSize; j<depth; ++j)
     {
       Scalar b = rhs[j];
       tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
       tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
     }
-    res[i] += alpha*tmp0; res[i+offset1] += alpha*tmp1; res[i+2] += alpha*tmp2; res[i+offset3] += alpha*tmp3;
+    res[i*resIncr]            += alpha*tmp0;
+    res[(i+offset1)*resIncr]  += alpha*tmp1;
+    res[(i+2)*resIncr]        += alpha*tmp2;
+    res[(i+offset3)*resIncr]  += alpha*tmp3;
   }
 
   // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = res.size();
+  Index end = rows;
   Index start = rowBound;
   do
   {
@@ -477,9 +482,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 
       // process remaining scalars
       // FIXME this loop get vectorized by the compiler !
-      for (Index j=alignedSize; j<size; ++j)
+      for (Index j=alignedSize; j<depth; ++j)
         tmp0 += cj.pmul(lhs0[j], rhs[j]);
-      res[i] += alpha*tmp0;
+      res[i*resIncr] += alpha*tmp0;
     }
     if (skipRows)
     {
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index abf8097d0..de4f0b7bb 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -119,11 +119,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co
       if (r>0)
       {
         Index s = IsLower ? 0 : pi + actualPanelWidth;
-        Block<Result,Dynamic,1> target(res,pi,0,actualPanelWidth,1);
-        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs>(
+        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>(
+            actualPanelWidth, r,
             &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(),
-            &(rhs.const_cast_derived().coeffRef(s)), r,
-            target, alpha);
+            &(rhs.const_cast_derived().coeffRef(s)), 1,
+            &res.coeffRef(pi,0), res.innerStride(), alpha);
       }
     }
   }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 1b7d03722..62a6207c6 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -49,9 +49,10 @@ template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index,
 static void ei_cache_friendly_product_colmajor_times_vector(
   Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha);
 
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static void ei_cache_friendly_product_rowmajor_times_vector(
-  const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsSize, ResType& res, Scalar alpha);
+  Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr, Scalar alpha);
 
 template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false>
 {
@@ -107,6 +108,17 @@ template<typename RealScalar> struct ei_conj_helper<RealScalar, std::complex<Rea
   { return x * y; }
 };
 
+template<bool Conjugate> struct ei_conj_if;
+
+template<> struct ei_conj_if<true> {
+  template<typename T>
+  inline T operator()(const T& x) { return ei_conj(x); }
+};
+
+template<> struct ei_conj_if<false> {
+  template<typename T>
+  inline const T& operator()(const T& x) { return x; }
+};
 
 // Lightweight helper class to access matrix coefficients.
 // Yes, this is somehow redundant with Map<>, but this version is much much lighter,
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 9d73ef51c..b01ceafb2 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -222,16 +222,4 @@ template<typename T> struct ei_is_diagonal<DiagonalWrapper<T> >
 template<typename T, int S> struct ei_is_diagonal<DiagonalMatrix<T,S> >
 { enum { ret = true }; };
 
-template<bool Conjugate> struct ei_conj_if;
-
-template<> struct ei_conj_if<true> {
-  template<typename T>
-  inline T operator()(const T& x) { return ei_conj(x); }
-};
-
-template<> struct ei_conj_if<false> {
-  template<typename T>
-  inline const T& operator()(const T& x) { return x; }
-};
-
 #endif // EIGEN_META_H