3 files changed, 103 insertions, 85 deletions
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index d63a7aa95..44fde3dcf 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -69,7 +69,7 @@ struct ProductReturnType<Lhs,Rhs,CacheFriendlyProduct>
   typedef typename ei_nested<Rhs,1,
                              typename ei_plain_matrix_type_column_major<Rhs>::type
                    >::type RhsNested;
-  
+
   typedef Product<LhsNested, RhsNested, CacheFriendlyProduct> Type;
 };
 
@@ -268,7 +268,9 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
       */
     EIGEN_STRONG_INLINE bool _useCacheFriendlyProduct() const
     {
-      return  m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+      #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
+      // TODO do something more accurate here
+      return m_lhs.cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
               && (  rows()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
                  || cols()>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD);
     }
@@ -624,7 +626,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirect
 
   typedef typename LhsProductTraits::ActualXprType ActualLhsType;
   typedef typename RhsProductTraits::ActualXprType ActualRhsType;
-  
+
   template<typename DestDerived>
   inline static void run(DestDerived& res, const ProductType& product, typename ProductType::Scalar alpha)
   {
@@ -633,7 +635,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirect
 
     Scalar actualAlpha = alpha * LhsProductTraits::extractScalarFactor(product.lhs())
                                * RhsProductTraits::extractScalarFactor(product.rhs());
-                             
+
     enum {
       EvalToRes = (ei_packet_traits<Scalar>::size==1)
                 ||((DestDerived::Flags&ActualPacketAccessBit) && (!(DestDerived::Flags & RowMajorBit))) };
@@ -645,7 +647,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirect
       _res = ei_aligned_stack_new(Scalar,res.size());
       Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
     }
-
+//     std::cerr << "colmajor * vector " << EvalToRes << "\n";
     ei_cache_friendly_product_colmajor_times_vector
       <LhsProductTraits::NeedToConjugate,RhsProductTraits::NeedToConjugate>(
       res.size(),
@@ -706,7 +708,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo
       _res = ei_aligned_stack_new(Scalar, res.size());
       Map<Matrix<Scalar,DestDerived::SizeAtCompileTime,1> >(_res, res.size()) = res;
     }
-              
+
     ei_cache_friendly_product_colmajor_times_vector
       <RhsProductTraits::NeedToConjugate,LhsProductTraits::NeedToConjugate>(res.size(),
       &actualRhs.const_cast_derived().coeffRef(0,0), actualRhs.stride(),
@@ -725,7 +727,7 @@ template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
 struct ei_cache_friendly_product_selector<ProductType,LhsRows,RowMajor,HasDirectAccess,1,RhsOrder,RhsAccess>
 {
   typedef typename ProductType::Scalar Scalar;
-  
+
   typedef ei_product_factor_traits<typename ei_traits<ProductType>::_LhsNested> LhsProductTraits;
   typedef ei_product_factor_traits<typename ei_traits<ProductType>::_RhsNested> RhsProductTraits;
 
@@ -753,7 +755,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,RowMajor,HasDirect
       _rhs = ei_aligned_stack_new(Scalar, actualRhs.size());
       Map<Matrix<Scalar,ActualRhsType::SizeAtCompileTime,1> >(_rhs, actualRhs.size()) = actualRhs;
     }
-              
+
     ei_cache_friendly_product_rowmajor_times_vector
       <LhsProductTraits::NeedToConjugate,RhsProductTraits::NeedToConjugate>(
         &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(),
@@ -774,7 +776,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo
 
   typedef typename LhsProductTraits::ActualXprType ActualLhsType;
   typedef typename RhsProductTraits::ActualXprType ActualRhsType;
-  
+
   enum {
       UseLhsDirectly = ((ei_packet_traits<Scalar>::size==1) || (ActualLhsType::Flags&ActualPacketAccessBit))
                      && (ActualLhsType::Flags & RowMajorBit) };
@@ -796,7 +798,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo
       _lhs = ei_aligned_stack_new(Scalar, actualLhs.size());
       Map<Matrix<Scalar,ActualLhsType::SizeAtCompileTime,1> >(_lhs, actualLhs.size()) = actualLhs;
     }
-              
+
     ei_cache_friendly_product_rowmajor_times_vector
       <RhsProductTraits::NeedToConjugate, LhsProductTraits::NeedToConjugate>(
         &actualRhs.const_cast_derived().coeffRef(0,0), actualRhs.stride(),
@@ -825,11 +827,12 @@ template<typename Derived>
 template<typename Lhs,typename Rhs>
 inline Derived&
 MatrixBase<Derived>::operator+=(const Flagged<Product<Lhs,Rhs,CacheFriendlyProduct>, 0, EvalBeforeNestingBit | EvalBeforeAssigningBit>& other)
-{
+{//std::cerr << "operator+=\n";
   if (other._expression()._useCacheFriendlyProduct())
     ei_cache_friendly_product_selector<Product<Lhs,Rhs,CacheFriendlyProduct> >::run(const_cast_derived(), other._expression(), Scalar(1));
-  else
+  else { //std::cerr << "no cf\n";
     lazyAssign(derived() + other._expression());
+  }
   return derived();
 }
 
@@ -893,7 +896,7 @@ inline void Product<Lhs,Rhs,ProductMode>::_cacheFriendlyEvalAndAdd(DestDerived&
 
   typedef typename LhsProductTraits::ActualXprType ActualLhsType;
   typedef typename RhsProductTraits::ActualXprType ActualRhsType;
-  
+
   const ActualLhsType& actualLhs = LhsProductTraits::extract(m_lhs);
   const ActualRhsType& actualRhs = RhsProductTraits::extract(m_rhs);
 
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 8a6bf3af3..f3567c96c 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -25,7 +25,9 @@
 #ifndef EIGEN_SOLVETRIANGULAR_H
 #define EIGEN_SOLVETRIANGULAR_H
 
-template<typename Lhs, typename Rhs, int Mode,
+template<typename Lhs, typename Rhs,
+  int Mode, // Upper/Lower | UnitDiag
+//   bool ConjugateLhs, bool ConjugateRhs,
   int UpLo = (Mode & LowerTriangularBit)
            ? LowerTriangular
            : (Mode & UpperTriangularBit)
@@ -33,15 +35,57 @@ template<typename Lhs, typename Rhs, int Mode,
            : -1,
   int StorageOrder = int(Lhs::Flags) & RowMajorBit
   >
-struct ei_solve_triangular_selector;
+struct ei_triangular_solver_selector;
 
 // forward substitution, row-major
-template<typename Lhs, typename Rhs, int Mode, int UpLo>
-struct ei_solve_triangular_selector<Lhs,Rhs,Mode,UpLo,RowMajor>
+template<typename Lhs, typename Rhs, int Mode, /*bool ConjugateLhs, bool ConjugateRhs,*/ int UpLo>
+struct ei_triangular_solver_selector<Lhs,Rhs,Mode,/*ConjugateLhs,ConjugateRhs,*/UpLo,RowMajor>
 {
   typedef typename Rhs::Scalar Scalar;
   static void run(const Lhs& lhs, Rhs& other)
-  {
+  {std::cerr << "here\n";
+    #if NOTDEF
+    const bool IsLowerTriangular = (UpLo==LowerTriangular);
+    const int size = lhs.cols();
+    for(int c=0 ; c<other.cols() ; ++c)
+    {
+      const int PanelWidth = 4;
+      for(int pi=IsLowerTriangular ? 0 : size;
+          IsLowerTriangular ? pi<size : pi>0;
+          IsLowerTriangular ? pi+=PanelWidth : pi-=PanelWidth)
+      {
+        int actualPanelWidth = std::min(IsLowerTriangular ? size - pi : pi, PanelWidth);
+        int startBlock = IsLowerTriangular ? pi : pi-actualPanelWidth;
+        int endBlock = IsLowerTriangular ? pi + actualPanelWidth : 0;
+
+        if (pi > 0)
+        {
+          int r = IsLowerTriangular ? size - endBlock : startBlock; // remaining size
+          ei_cache_friendly_product_colmajor_times_vector<false,false>(
+            r,
+            &(lhs.const_cast_derived().coeffRef(endBlock,startBlock)), lhs.stride(),
+            other.col(c).segment(startBlock, actualPanelWidth),
+            &(other.coeffRef(endBlock, c)),
+            Scalar(-1));
+        }
+
+        for(int k=0; k<actualPanelWidth; ++k)
+        {
+          int i = IsLowerTriangular ? pi+k : pi-k-1;
+          if(!(Mode & UnitDiagBit))
+            other.coeffRef(i,c) /= lhs.coeff(i,i);
+
+          int r = actualPanelWidth - k - 1; // remaining size
+          if (r>0)
+          {
+            other.col(c).segment((IsLowerTriangular ? i+1 : i-r), r) -=
+                  other.coeffRef(i,c)
+                * Block<Lhs,Dynamic,1>(lhs, (IsLowerTriangular ? i+1 : i-r), i, r, 1);
+          }
+        }
+      }
+    }
+    #else
     const bool IsLowerTriangular = (UpLo==LowerTriangular);
     const int size = lhs.cols();
     /* We perform the inverse product per block of 4 rows such that we perfectly match
@@ -115,6 +159,7 @@ struct ei_solve_triangular_selector<Lhs,Rhs,Mode,UpLo,RowMajor>
         }
       }
     }
+    #endif
   }
 };
 
@@ -124,7 +169,7 @@ struct ei_solve_triangular_selector<Lhs,Rhs,Mode,UpLo,RowMajor>
 //  - inv(UpperTriangular,         ColMajor) * Column vector
 //  - inv(UpperTriangular,UnitDiag,ColMajor) * Column vector
 template<typename Lhs, typename Rhs, int Mode, int UpLo>
-struct ei_solve_triangular_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
+struct ei_triangular_solver_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
   typedef typename ei_packet_traits<Scalar>::type Packet;
@@ -132,77 +177,44 @@ struct ei_solve_triangular_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 
   static void run(const Lhs& lhs, Rhs& other)
   {
+    static const int PanelWidth = 4; // TODO make this a user definable constant
     static const bool IsLowerTriangular = (UpLo==LowerTriangular);
     const int size = lhs.cols();
     for(int c=0 ; c<other.cols() ; ++c)
     {
-      /* let's perform the inverse product per block of 4 columns such that we perfectly match
-       * our optimized matrix * vector product. blockyEnd represents the number of rows
-       * we can process using the block version.
-       */
-      int blockyEnd = (std::max(size-5,0)/4)*4;
-      if (!IsLowerTriangular)
-        blockyEnd = size-1 - blockyEnd;
-      for(int i=IsLowerTriangular ? 0 : size-1; IsLowerTriangular ? i<blockyEnd : i>blockyEnd;)
+      for(int pi=IsLowerTriangular ? 0 : size;
+          IsLowerTriangular ? pi<size : pi>0;
+          IsLowerTriangular ? pi+=PanelWidth : pi-=PanelWidth)
       {
-        /* Let's process the 4x4 sub-matrix as usual.
-         * btmp stores the diagonal coefficients used to update the remaining part of the result.
-         */
-        int startBlock = i;
-        int endBlock = startBlock + (IsLowerTriangular ? 4 : -4);
-        Matrix<Scalar,4,1> btmp;
-        for (;IsLowerTriangular ? i<endBlock : i>endBlock;
-             i += IsLowerTriangular ? 1 : -1)
+        int actualPanelWidth = std::min(IsLowerTriangular ? size - pi : pi, PanelWidth);
+        int startBlock = IsLowerTriangular ? pi : pi-actualPanelWidth;
+        int endBlock = IsLowerTriangular ? pi + actualPanelWidth : 0;
+
+        for(int k=0; k<actualPanelWidth; ++k)
         {
+          int i = IsLowerTriangular ? pi+k : pi-k-1;
           if(!(Mode & UnitDiagBit))
             other.coeffRef(i,c) /= lhs.coeff(i,i);
-          int remainingSize = IsLowerTriangular ? endBlock-i-1 : i-endBlock-1;
-          if (remainingSize>0)
-            other.col(c).segment((IsLowerTriangular ? i : endBlock) + 1, remainingSize) -=
-                other.coeffRef(i,c)
-              * Block<Lhs,Dynamic,1>(lhs, (IsLowerTriangular ? i : endBlock) + 1, i, remainingSize, 1);
-          btmp.coeffRef(IsLowerTriangular ? i-startBlock : remainingSize) = -other.coeffRef(i,c);
-        }
-
-        /* Now we can efficiently update the remaining part of the result as a matrix * vector product.
-         * NOTE in order to reduce both compilation time and binary size, let's directly call
-         * the fast product implementation. It is equivalent to the following code:
-         *   other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
-         *                                       * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
-         */
-        // FIXME this is cool but what about conjugate/adjoint expressions ? do we want to evaluate them ?
-        // this is a more general problem though.
-        ei_cache_friendly_product_colmajor_times_vector(
-          IsLowerTriangular ? size-endBlock : endBlock+1,
-          &(lhs.const_cast_derived().coeffRef(IsLowerTriangular ? endBlock : 0, IsLowerTriangular ? startBlock : endBlock+1)),
-          lhs.stride(),
-          btmp, &(other.coeffRef(IsLowerTriangular ? endBlock : 0, c)),
-          Scalar(1));
-// 				if (IsLowerTriangular)
-//           other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
-//                                           * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
-// 				else
-//           other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
-//                                           * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
-      }
-
-      /* Now we have to process the remaining part as usual */
-      int i;
-      for(i=blockyEnd; IsLowerTriangular ? i<size-1 : i>0; i += (IsLowerTriangular ? 1 : -1) )
-      {
-        if(!(Mode & UnitDiagBit))
-          other.coeffRef(i,c) /= lhs.coeff(i,i);
 
-        /* NOTE we cannot use lhs.col(i).end(size-i-1) because Part::coeffRef gets called by .col() to
-         * get the address of the start of the row
-         */
-        if(IsLowerTriangular)
-          other.col(c).end(size-i-1) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, i+1,i, size-i-1,1);
-        else
-          other.col(c).start(i) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, 0,i, i, 1);
+          int r = actualPanelWidth - k - 1; // remaining size
+          if (r>0)
+          {
+            other.col(c).segment((IsLowerTriangular ? i+1 : i-r), r) -=
+                  other.coeffRef(i,c)
+                * Block<Lhs,Dynamic,1>(lhs, (IsLowerTriangular ? i+1 : i-r), i, r, 1);
+          }
+        }
+        int r = IsLowerTriangular ? size - endBlock : startBlock; // remaining size
+        if (r > 0)
+        {
+          ei_cache_friendly_product_colmajor_times_vector<false,false>(
+            r,
+            &(lhs.const_cast_derived().coeffRef(endBlock,startBlock)), lhs.stride(),
+            other.col(c).segment(startBlock, actualPanelWidth),
+            &(other.coeffRef(endBlock, c)),
+            Scalar(-1));
+        }
       }
-      if(!(Mode & UnitDiagBit))
-        other.coeffRef(i,c) /= lhs.coeff(i,i);
     }
   }
 };
@@ -232,7 +244,7 @@ void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<RhsDerived>&
     typename ei_plain_matrix_type_column_major<RhsDerived>::type, RhsDerived&>::ret RhsCopy;
   RhsCopy rhsCopy(rhs);
 
-  ei_solve_triangular_selector<MatrixType, typename ei_unref<RhsCopy>::type, Mode>::run(_expression(), rhsCopy);
+  ei_triangular_solver_selector<MatrixType, typename ei_unref<RhsCopy>::type, Mode>::run(_expression(), rhsCopy);
 
   if (copy)
     rhs = rhsCopy;
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 851bf808f..61b2cc67c 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -57,6 +57,8 @@ void ei_cache_friendly_product_colmajor_times_vector(
   if(ConjugateRhs)
     alpha = ei_conj(alpha);
 
+//   std::cerr << "prod " << size << " " << rhs.size() << "\n";
+
   typedef typename ei_packet_traits<Scalar>::type Packet;
   const int PacketSize = sizeof(Packet)/sizeof(Scalar);
 
@@ -101,7 +103,10 @@ void ei_cache_friendly_product_colmajor_times_vector(
       // note that the skiped columns are processed later.
     }
 
-    ei_internal_assert((alignmentPattern==NoneAligned) || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
+    ei_internal_assert(  (alignmentPattern==NoneAligned)
+                      || (skipColumns + columnsAtOnce >= rhs.size())
+                      || PacketSize > size
+                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
   }
 
   int offset1 = (FirstAligned && alignmentStep==1?3:1);
@@ -127,7 +132,6 @@ void ei_cache_friendly_product_colmajor_times_vector(
         res[j] = cj.pmadd(lhs1[j], ei_pfirst(ptmp1), res[j]);
         res[j] = cj.pmadd(lhs2[j], ei_pfirst(ptmp2), res[j]);
         res[j] = cj.pmadd(lhs3[j], ei_pfirst(ptmp3), res[j]);
-//         res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
       }
 
       if (alignedSize>alignedStart)
@@ -193,7 +197,6 @@ void ei_cache_friendly_product_colmajor_times_vector(
       res[j] = cj.pmadd(lhs1[j], ei_pfirst(ptmp1), res[j]);
       res[j] = cj.pmadd(lhs2[j], ei_pfirst(ptmp2), res[j]);
       res[j] = cj.pmadd(lhs3[j], ei_pfirst(ptmp3), res[j]);
-//       res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
     }
   }
 
@@ -212,7 +215,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
         /* explicit vectorization */
         // process first unaligned result's coeffs
         for (int j=0; j<alignedStart; ++j)
-          res[j] = cj.pmul(lhs0[j], ei_pfirst(ptmp0));
+          res[j] += cj.pmul(lhs0[j], ei_pfirst(ptmp0));
 
         // process aligned result's coeffs
         if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)