1 files changed, 163 insertions, 28 deletions
diff --git a/Eigen/src/Core/InverseProduct.h b/Eigen/src/Core/InverseProduct.h
index 0ee54a3fb..87f426af5 100755
--- a/Eigen/src/Core/InverseProduct.h
+++ b/Eigen/src/Core/InverseProduct.h
@@ -25,51 +25,186 @@
 #ifndef EIGEN_INVERSEPRODUCT_H
 #define EIGEN_INVERSEPRODUCT_H
 
+template<typename Lhs, typename Rhs,
+  int TriangularPart = (int(Lhs::Flags) & LowerTriangularBit)
+                     ? Lower
+                     : (int(Lhs::Flags) & UpperTriangularBit)
+                     ? Upper
+                     : -1,
+  int StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor
+  >
+struct ei_trisolve_selector;
 
-/** "in-place" version of MatrixBase::inverseProduct() where the result is written in \a other
-  *
-  * \sa inverseProduct()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-void MatrixBase<Derived>::inverseProductInPlace(MatrixBase<OtherDerived>& other) const
+// forward substitution, row-major
+template<typename Lhs, typename Rhs>
+struct ei_trisolve_selector<Lhs,Rhs,Lower,RowMajor>
 {
-  ei_assert(cols() == other.rows());
-  ei_assert(!(Flags & ZeroDiagBit));
-  ei_assert(Flags & (UpperTriangularBit|LowerTriangularBit));
-
-  for(int c=0 ; c<other.cols() ; ++c)
+  typedef typename Rhs::Scalar Scalar;
+  static void run(const Lhs& lhs, Rhs& other)
   {
-    if(Flags & LowerTriangularBit)
+    for(int c=0 ; c<other.cols() ; ++c)
     {
-      // forward substitution
-      if(!(Flags & UnitDiagBit))
-        other.coeffRef(0,c) = other.coeff(0,c)/coeff(0, 0);
-      for(int i=1; i<rows(); ++i)
+      if(!(Lhs::Flags & UnitDiagBit))
+        other.coeffRef(0,c) = other.coeff(0,c)/lhs.coeff(0, 0);
+      for(int i=1; i<lhs.rows(); ++i)
       {
-        Scalar tmp = other.coeff(i,c) - ((this->row(i).start(i)) * other.col(c).start(i)).coeff(0,0);
-        if (Flags & UnitDiagBit)
+        Scalar tmp = other.coeff(i,c) - ((lhs.row(i).start(i)) * other.col(c).start(i)).coeff(0,0);
+        if (Lhs::Flags & UnitDiagBit)
           other.coeffRef(i,c) = tmp;
         else
-          other.coeffRef(i,c) = tmp/coeff(i,i);
+          other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
       }
     }
-    else
+  }
+};
+
+// backward substitution, row-major
+template<typename Lhs, typename Rhs>
+struct ei_trisolve_selector<Lhs,Rhs,Upper,RowMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    const int size = lhs.cols();
+    for(int c=0 ; c<other.cols() ; ++c)
     {
-      // backward substitution
-      if(!(Flags & UnitDiagBit))
-        other.coeffRef(cols()-1,c) = other.coeff(cols()-1, c)/coeff(rows()-1, cols()-1);
-      for(int i=rows()-2 ; i>=0 ; --i)
+      if(!(Lhs::Flags & UnitDiagBit))
+        other.coeffRef(size-1,c) = other.coeff(size-1, c)/lhs.coeff(size-1, size-1);
+      for(int i=size-2 ; i>=0 ; --i)
       {
         Scalar tmp = other.coeff(i,c)
-                   - ((this->row(i).end(cols()-i-1)) * other.col(c).end(cols()-i-1)).coeff(0,0);
-        if (Flags & UnitDiagBit)
+                   - ((lhs.row(i).end(size-i-1)) * other.col(c).end(size-i-1)).coeff(0,0);
+        if (Lhs::Flags & UnitDiagBit)
           other.coeffRef(i,c) = tmp;
         else
-          other.coeffRef(i,c) = tmp/coeff(i,i);
+          other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
       }
     }
   }
+};
+
+// forward substitution, col-major
+template<typename Lhs, typename Rhs>
+struct ei_trisolve_selector<Lhs,Rhs,Lower,ColMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  enum {PacketSize =  ei_packet_traits<Scalar>::size};
+
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    const int size = lhs.cols();
+    for(int c=0 ; c<other.cols() ; ++c)
+    {
+      /* let's perform the inverse product per block of 4 columns such that we perfectly match
+       * our optimized matrix * vector product.
+       */
+      int blockyEnd = (std::max(size-5,0)/4)*4;
+      for(int i=0; i<blockyEnd;)
+      {
+        int startBlock = i;
+        int endBlock = startBlock+4;
+        Matrix<Scalar,4,1> btmp;
+        /* Let's process the 4x4 sub-matrix as usual.
+         * btmp stores the diagonal coefficients used to update the remaining part of the result.
+         */
+        for (;i<endBlock;++i)
+        {
+          if(!(Lhs::Flags & UnitDiagBit))
+            other.coeffRef(i,c) /= lhs.coeff(i,i);
+          int remainingSize = endBlock-i-1;
+          if (remainingSize>0)
+            other.col(c).block(i+1,remainingSize) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, i+1, i, remainingSize, 1);
+          btmp.coeffRef(i-startBlock) = -other.coeffRef(i,c);
+        }
+
+        /* Now we can efficiently update the remaining part of the result as a matrix * vector product.
+         * NOTE in order to reduce both compilation time and binary size, let's directly call
+         * the fast product implementation. It is equivalent to the following code:
+         *   other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
+         *                                       * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
+         */
+        ei_cache_friendly_product_colmajor_times_vector(
+          size-endBlock, &(lhs.const_cast_derived().coeffRef(endBlock,startBlock)), lhs.stride(),
+          btmp, &(other.coeffRef(endBlock,c)));
+      }
+
+      /* Now we have to process the remaining part as usual */
+      int i;
+      for(i=blockyEnd; i<size-1; ++i)
+      {
+        if(!(Lhs::Flags & UnitDiagBit))
+          other.coeffRef(i,c) /= lhs.coeff(i,i);
+        // NOTE we cannot use lhs.col(i).end(size-i-1) because Part::coeffRef gets called by .col() to
+        // get the address of the start of the row
+        other.col(c).end(size-i-1) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, i+1,i, size-i-1,1);
+      }
+      if(!(Lhs::Flags & UnitDiagBit))
+        other.coeffRef(i,c) /= lhs.coeff(i,i);
+    }
+  }
+};
+
+// backward substitution, col-major
+template<typename Lhs, typename Rhs>
+struct ei_trisolve_selector<Lhs,Rhs,Upper,ColMajor>
+{
+  typedef typename Rhs::Scalar Scalar;
+  static void run(const Lhs& lhs, Rhs& other)
+  {
+    const int size = lhs.cols();
+    for(int c=0 ; c<other.cols() ; ++c)
+    {
+      int blockyEnd = size-1 - (std::max(size-5,0)/4)*4;
+      for(int i=size-1; i>blockyEnd;)
+      {
+        int startBlock = i;
+        int endBlock = startBlock-4;
+        Matrix<Scalar,4,1> btmp;
+        /* Let's process the 4x4 sub-matrix as usual.
+         * btmp stores the diagonal coefficients used to update the remaining part of the result.
+         */
+        for (; i>endBlock; --i)
+        {
+          if(!(Lhs::Flags & UnitDiagBit))
+            other.coeffRef(i,c) /= lhs.coeff(i,i);
+          int remainingSize = i-endBlock-1;
+          if (remainingSize>0)
+            other.col(c).block(endBlock+1,remainingSize) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, endBlock+1, i, remainingSize, 1);
+          btmp.coeffRef(remainingSize) = -other.coeffRef(i,c);
+        }
+
+        ei_cache_friendly_product_colmajor_times_vector(
+          endBlock+1, &(lhs.const_cast_derived().coeffRef(0,endBlock+1)), lhs.stride(),
+          btmp, &(other.coeffRef(0,c)));
+      }
+
+      for(int i=blockyEnd; i>0; --i)
+      {
+        if(!(Lhs::Flags & UnitDiagBit))
+          other.coeffRef(i,c) /= lhs.coeff(i,i);
+        other.col(c).start(i) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, 0,i, i, 1);
+      }
+      if(!(Lhs::Flags & UnitDiagBit))
+        other.coeffRef(0,c) /= lhs.coeff(0,0);
+    }
+  }
+};
+
+/** "in-place" version of MatrixBase::inverseProduct() where the result is written in \a other
+  *
+  * \sa inverseProduct()
+  */
+template<typename Derived>
+template<typename OtherDerived>
+void MatrixBase<Derived>::inverseProductInPlace(MatrixBase<OtherDerived>& other) const
+{
+  ei_assert(derived().cols() == derived().rows());
+  ei_assert(derived().cols() == other.rows());
+  ei_assert(!(Flags & ZeroDiagBit));
+  ei_assert(Flags & (UpperTriangularBit|LowerTriangularBit));
+
+  ei_trisolve_selector<Derived, OtherDerived>::run(derived(), other.derived());
 }
 
 /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.