From f8678272a4244babe25cc92bbb9298ed922330a4 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Sun, 11 Jul 2010 23:57:23 +0200
Subject: mixing types step 3:

 - improve support of colmajor by vector and matrix * matrix products
 - now all configurations are well handled, but the performance is not always very good

---
 Eigen/src/Core/products/GeneralBlockPanelKernel.h | 48 +++++++++++++++++------
 Eigen/src/Core/products/GeneralMatrixMatrix.h     | 10 +++--
 Eigen/src/Core/products/GeneralMatrixVector.h     | 43 ++++++++++++++------
 3 files changed, 73 insertions(+), 28 deletions(-)

(limited to 'Eigen/src/Core/products')

diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 9214582ed..e1c258ff0 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -140,17 +140,26 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st
 #ifdef EIGEN_HAS_FUSE_CJMADD
   #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
 #else
-  #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = ei_padd(C,ResPacket(T));
+  #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = ei_padd(C,T);
 #endif

-// optimized GEneral packed Block * packed Panel product kernel
+/* optimized GEneral packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ *  |  A  |  B  | comments
+ *  |real |cplx | no vectorization yet, would require packing A with duplication
+ *  |cplx |real | easy vectorization
+ */
 template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct ei_gebp_kernel
 {
   typedef typename ei_scalar_product_traits<LhsScalar,RhsScalar>::ReturnType ResScalar;
   enum {
-    Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable,
+    Vectorizable = ei_product_blocking_traits<LhsScalar,RhsScalar>::Vectorizable /*ei_packet_traits<LhsScalar>::Vectorizable
+                && ei_packet_traits<RhsScalar>::Vectorizable
+                && (ei_is_same_type<LhsScalar,RhsScalar>::ret
+                || (NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex))*/,
     LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
     RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
     ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1
@@ -766,36 +775,51 @@ template
   enum { PacketSize = ei_packet_traits<Scalar>::size };
   ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   ei_conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   ei_const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
+  bool hasAlpha = alpha != Scalar(1);
   Index count = 0;
   Index peeled_mc = (rows/mr)*mr;
   for(Index i=0; i

   if(rows-peeled_mc>=PacketSize)
   {
     if(PanelMode) count += PacketSize*offset;
-    for(Index k=0; k
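The "cplx * real" row of the mixing-type table in the kernel comment above is the easy case because a packet of complex values is stored as interleaved (real, imag) scalars, so scaling by a real coefficient is one homogeneous multiply per scalar lane. The opposite row is marked "no vectorization yet" because a real lhs would have to be packed with every value duplicated to line up against the (real, imag) pairs. A minimal scalar sketch of the easy case, plain C++ standing in for the packet code, not the Eigen kernel itself:

#include <complex>

// Scale n complex values by a real factor by viewing them as 2*n floats:
// each real and imaginary component is multiplied by the same coefficient,
// which maps onto a single SIMD multiply once the coefficient is duplicated
// across a packet. std::complex guarantees this (re,im,re,im,...) layout.
void scale_complex_by_real(std::complex<float>* x, float a, int n)
{
  float* f = reinterpret_cast<float*>(x);
  for (int i = 0; i < 2 * n; ++i)
    f[i] *= a;
}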
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
   ei_const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
   typedef ei_product_blocking_traits<LhsScalar,RhsScalar> Blocking;
+  bool alphaOnLhs = NumTraits<LhsScalar>::IsComplex && !NumTraits<RhsScalar>::IsComplex;

   Index kc = blocking.kc();                 // cache block size along the K direction
   Index mc = std::min(rows,blocking.mc());  // cache block size along the M direction
@@ -86,7 +87,7 @@ static void run(Index rows, Index cols, Index depth,
   ei_gemm_pack_rhs pack_rhs;
   ei_gebp_kernel gebp;

-//   if (ConjugateRhs)
+//   if ((ConjugateRhs && !alphaOnLhs) || (ConjugateLhs && alphaOnLhs))
 //     alpha = ei_conj(alpha);
 //   ei_gemm_pack_lhs pack_lhs;
 //   ei_gemm_pack_rhs pack_rhs;
@@ -177,6 +178,9 @@ static void run(Index rows, Index cols, Index depth,
     RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB();
     RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW();

+    LhsScalar lhsAlpha = alphaOnLhs ? ei_get_factor<ResScalar,LhsScalar>::run(alpha) : LhsScalar(1);
+    RhsScalar rhsAlpha = alphaOnLhs ? RhsScalar(1) : ei_get_factor<ResScalar,RhsScalar>::run(alpha);
+
     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
     // (==GEMM_VAR1)
     for(Index k2=0; k2

       // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
       // Note that this panel will be read as many times as the number of blocks in the lhs's
       // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+      pack_rhs(blockB, &rhs(k2,0), rhsStride, rhsAlpha, actual_kc, cols);

       // For each mc x kc block of the lhs's vertical panel...
@@ -199,7 +203,7 @@ static void run(Index rows, Index cols, Index depth,
       // We pack the lhs's block into a sequential chunk of memory (L1 caching)
       // Note that this block will be read a very high number of times, which is equal to the number of
       // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-      pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+      pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc, lhsAlpha);

       // Everything is packed, we can now call the block * panel kernel:
       gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW);
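The lhsAlpha/rhsAlpha pair above decides which operand absorbs the scaling factor while packing: when the lhs is complex and the rhs is real (alphaOnLhs), alpha is folded into the complex lhs so the packed rhs can stay real; in every other configuration it is folded into the rhs as before. The patch does not show the definition of ei_get_factor, so the sketch below uses a hypothetical get_factor stand-in with the same role, converting the result-typed alpha to the scalar type of the operand that absorbs it:

#include <complex>
#include <iostream>

// Hypothetical stand-in for ei_get_factor<From,To>. The complex-to-real
// specialization is only reached on the branch where alpha is known to be
// real-valued, so discarding the imaginary part is safe there.
template<typename From, typename To> struct get_factor {
  static To run(const From& x) { return To(x); }
};
template<typename T> struct get_factor<std::complex<T>, T> {
  static T run(const std::complex<T>& x) { return x.real(); }
};

int main()
{
  typedef std::complex<float> Cplx;
  // Mimics the GEMM path above for LhsScalar=Cplx, RhsScalar=float:
  const bool alphaOnLhs = true;   // complex lhs, real rhs
  Cplx  alpha(2.f, 1.f);
  Cplx  lhsAlpha = alphaOnLhs ? get_factor<Cplx, Cplx>::run(alpha) : Cplx(1);
  float rhsAlpha = alphaOnLhs ? 1.f : get_factor<Cplx, float>::run(alpha);
  std::cout << lhsAlpha << " " << rhsAlpha << "\n";  // prints (2,1) 1
}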
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index d772834a2..4d2f82680 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -30,7 +30,13 @@
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
- * TODO: since rhs gets evaluated only once, no need to evaluate it
+ *
+ * Mixing type logic: C += alpha * A * B
+ *  |  A  |  B  |alpha| comments
+ *  |real |cplx |cplx | no vectorization
+ *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
+ *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
+ *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
  */
 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
 struct ei_general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs>
@@ -39,7 +45,7 @@ typedef typename ei_scalar_product_traits<LhsScalar,RhsScalar>::ReturnType ResS
 enum {
   Vectorizable = ei_packet_traits<LhsScalar>::Vectorizable && ei_packet_traits<RhsScalar>::Vectorizable
-              && ei_packet_traits<LhsScalar>::size==ei_packet_traits<RhsScalar>::size,
+              && int(ei_packet_traits<LhsScalar>::size)==int(ei_packet_traits<RhsScalar>::size),
   LhsPacketSize = Vectorizable ? ei_packet_traits<LhsScalar>::size : 1,
   RhsPacketSize = Vectorizable ? ei_packet_traits<RhsScalar>::size : 1,
   ResPacketSize = Vectorizable ? ei_packet_traits<ResScalar>::size : 1
@@ -58,7 +64,7 @@ EIGEN_DONT_INLINE static void run(
   const LhsScalar* lhs, Index lhsStride,
   const RhsScalar* rhs, Index rhsIncr,
   ResScalar* res, Index resIncr,
-  ResScalar alpha)
+  RhsScalar alpha)
 {
   ei_internal_assert(resIncr==1);
   #ifdef _EIGEN_ACCUMULATE_PACKETS
@@ -182,6 +188,7 @@ EIGEN_DONT_INLINE static void run(
       if(peels>1)
       {
         LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
+        ResPacket T0, T1;

         A01 = ei_pload(&lhs1[alignedStart-1]);
         A02 = ei_pload(&lhs2[alignedStart-2]);
@@ -195,20 +202,20 @@ EIGEN_DONT_INLINE static void run(
           A00 = ei_pload(&lhs0[j]);
           A10 = ei_pload(&lhs0[j+LhsPacketSize]);

-          A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j]));
-          A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+ResPacketSize]));
+          T0 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j]));
+          T1 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+ResPacketSize]));

-          A00 = pcj.pmadd(A01, ptmp1, A00);
+          T0 = pcj.pmadd(A01, ptmp1, T0);
           A01 = ei_pload(&lhs1[j-1+2*LhsPacketSize]);  ei_palign<1>(A11,A01);
-          A00 = pcj.pmadd(A02, ptmp2, A00);
+          T0 = pcj.pmadd(A02, ptmp2, T0);
           A02 = ei_pload(&lhs2[j-2+2*LhsPacketSize]);  ei_palign<2>(A12,A02);
-          A00 = pcj.pmadd(A03, ptmp3, A00);
-          ei_pstore(&res[j],A00);
+          T0 = pcj.pmadd(A03, ptmp3, T0);
+          ei_pstore(&res[j],T0);
           A03 = ei_pload(&lhs3[j-3+2*LhsPacketSize]);  ei_palign<3>(A13,A03);
-          A10 = pcj.pmadd(A11, ptmp1, A10);
-          A10 = pcj.pmadd(A12, ptmp2, A10);
-          A10 = pcj.pmadd(A13, ptmp3, A10);
-          ei_pstore(&res[j+ResPacketSize],A10);
+          T1 = pcj.pmadd(A11, ptmp1, T1);
+          T1 = pcj.pmadd(A12, ptmp2, T1);
+          T1 = pcj.pmadd(A13, ptmp3, T1);
+          ei_pstore(&res[j+ResPacketSize],T1);
         }
       }
       for (Index j = peeledSize; j

 struct ei_general_matrix_vector_product
 {
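The third row of the type table in GeneralMatrixVector.h ("cplx lhs, real rhs, cplx alpha") is marked invalid for the kernel: a genuinely complex alpha cannot be absorbed by the real rhs, so the caller evaluates tmp = A*B first and applies alpha afterwards. A toy dispatch illustrating that rule (Eigen makes this choice at compile time on the scalar types; every name and signature below is illustrative only):

#include <complex>
#include <vector>

typedef std::complex<float> Cplx;

// "Optimal case" row: complex lhs, real rhs, real alpha; this is the shape
// the kernel can vectorize via real-cplx multiplication.
void gemv_cplx_real(const Cplx* A, const float* b, Cplx* c,
                    float alpha, int rows, int cols)
{
  for (int j = 0; j < cols; ++j)        // column-major traversal of A
    for (int i = 0; i < rows; ++i)
      c[i] += alpha * A[i + j * rows] * b[j];
}

// "Invalid" row handled on the caller side: tmp = A*b, then c += alpha*tmp.
void gemv_cplx_real_cplx_alpha(const Cplx* A, const float* b, Cplx* c,
                               Cplx alpha, int rows, int cols)
{
  std::vector<Cplx> tmp(rows, Cplx(0));
  gemv_cplx_real(A, b, &tmp[0], 1.f, rows, cols);
  for (int i = 0; i < rows; ++i)
    c[i] += alpha * tmp[i];
}

The T0/T1 temporaries introduced in the peeled loop above serve two purposes: once LhsScalar and RhsScalar may differ, a loaded LhsPacket such as A00 can no longer double as a ResPacket accumulator, and the two separate accumulation chains keep the multiply-adds for res[j] and res[j+ResPacketSize] independent. A scalar sketch of the same structure, with float lhs data accumulated into complex results (illustrative, not the packet code):

#include <complex>

typedef std::complex<float> Cplx;

// res[j], res[j+1] += c0*lhs0[j..j+1] + c1*lhs1[j..j+1], two results per
// iteration. t0/t1 are typed like the result (Cplx), not like the loaded
// lhs values (float), and form two independent dependency chains that the
// CPU can overlap.
void peeled_update(const float* lhs0, const float* lhs1,
                   Cplx c0, Cplx c1, Cplx* res, int n)
{
  int j = 0;
  for (; j + 2 <= n; j += 2) {
    Cplx t0 = res[j]   + c0 * lhs0[j]   + c1 * lhs1[j];
    Cplx t1 = res[j+1] + c0 * lhs0[j+1] + c1 * lhs1[j+1];
    res[j]   = t0;
    res[j+1] = t1;
  }
  for (; j < n; ++j)                    // scalar tail
    res[j] += c0 * lhs0[j] + c1 * lhs1[j];
}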