5 files changed, 205 insertions, 84 deletions
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 678938c6b..722881215 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -313,7 +313,8 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
 template<size_t offset, typename Packet>
 struct protate_impl
 {
-  static Packet run(const Packet& a) { return a; }
+  // Empty so attempts to use this unimplemented path will fail to compile.
+  // Only specializations of this template should be used.
 };
 
 /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
@@ -322,7 +323,6 @@ struct protate_impl
   */
 template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
 {
-  EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET);
   return offset ? protate_impl<offset, Packet>::run(a) : a;
 }
 
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index e9af45f22..8dd1e1370 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -76,12 +76,12 @@ typedef uint32x4_t  Packet4ui;
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
-  typedef Packet2f half;
+  typedef Packet4f half; // Packet2f intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket=1,
+    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
    
     HasDiv  = 1,
     // FIXME check the Has*
@@ -95,12 +95,12 @@ template<> struct packet_traits<float>  : default_packet_traits
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
-  typedef Packet2i half;
+  typedef Packet4i half; // Packet2i intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
-    HasHalfPacket=1
+    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
     // FIXME check the Has*
   };
 };
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 0890bc690..9061fd936 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -88,15 +88,15 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
   EIGEN_UNUSED_VARIABLE(num_threads);
   enum {
-    kr = 16,
+    kr = 8,
     mr = Traits::mr,
     nr = Traits::nr
   };
   k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
   if (k > kr) k -= k % kr;
-  m = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+  m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
   if (m > mr) m -= m % mr;
-  n = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+  n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
   if (n > nr) n -= n % nr;
   return;
 #endif
@@ -153,16 +153,104 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   }
   else {
     // In unit tests we do not want to use extra large matrices,
-    // so we reduce the block size to check the blocking strategy is not flawed
-#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    k = std::min<Index>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-    n = std::min<Index>(n,3840/sizeof(RhsScalar));
-    m = std::min<Index>(m,3840/sizeof(RhsScalar));
-#else
-    k = std::min<Index>(k,24);
-    n = std::min<Index>(n,384/sizeof(RhsScalar));
-    m = std::min<Index>(m,384/sizeof(RhsScalar));
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 4*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
 #endif
+    
+    // Early return for small problems because the computation below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if(std::max(k,std::max(m,n))<48)
+      return;
+    
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+    
+    // ---- 1st level of blocking on L1, yields kc ----
+    
+    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
+    // We also include a register-level block of the result (mx x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+                        
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+    
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+    
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+    
+    
+    
+    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc to growth by a factor x1.5, leading to:
+    const Index max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = 576;
+      }
+      
+      Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+      
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+    }
   }
 }
 
@@ -712,6 +800,80 @@ protected:
   conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
 };
 
+// helper for the rotating kernel below
+template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
+struct PossiblyRotatingKernelHelper
+{
+  // default implementation, not rotating
+
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
+  }
+
+  void unrotateResult(AccPacket&,
+                      AccPacket&,
+                      AccPacket&,
+                      AccPacket&)
+  {
+  }
+};
+
+// rotating implementation
+template <typename GebpKernel>
+struct PossiblyRotatingKernelHelper<GebpKernel, true>
+{
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    if (Index == 0) {
+      to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
+    } else {
+      EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
+      to = protate<1>(to);
+    }
+  }
+
+  void unrotateResult(AccPacket& res0,
+                      AccPacket& res1,
+                      AccPacket& res2,
+                      AccPacket& res3)
+  {
+    PacketBlock<AccPacket> resblock;
+    resblock.packet[0] = res0;
+    resblock.packet[1] = res1;
+    resblock.packet[2] = res2;
+    resblock.packet[3] = res3;
+    ptranspose(resblock);
+    resblock.packet[3] = protate<1>(resblock.packet[3]);
+    resblock.packet[2] = protate<2>(resblock.packet[2]);
+    resblock.packet[1] = protate<3>(resblock.packet[1]);
+    ptranspose(resblock);
+    res0 = resblock.packet[0];
+    res1 = resblock.packet[1];
+    res2 = resblock.packet[2];
+    res3 = resblock.packet[3];
+  }
+};
+
 /* optimized GEneral packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
@@ -745,6 +907,16 @@ struct gebp_kernel
     ResPacketSize = Traits::ResPacketSize
   };
 
+
+  static const bool UseRotatingKernel =
+    EIGEN_ARCH_ARM &&
+    internal::is_same<LhsScalar, float>::value &&
+    internal::is_same<RhsScalar, float>::value &&
+    internal::is_same<ResScalar, float>::value &&
+    Traits::LhsPacketSize == 4 &&
+    Traits::RhsPacketSize == 4 &&
+    Traits::ResPacketSize == 4;
+
   EIGEN_DONT_INLINE
   void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                   Index rows, Index depth, Index cols, ResScalar alpha,
@@ -778,6 +950,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     // Usually, make sense only with FMA
     if(mr>=3*Traits::LhsProgress)
     {
+      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
+
       // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
       for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
       {
@@ -813,43 +987,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           prefetch(&blB[0]);
           LhsPacket A0, A1;
 
-#define EIGEN_ARCH_PREFERS_ROTATING_KERNEL EIGEN_ARCH_ARM
-
-#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
-          static const bool UseRotatingKernel =
-            Traits::LhsPacketSize == 4 &&
-            Traits::RhsPacketSize == 4 &&
-            Traits::ResPacketSize == 4;
-#endif
-
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
             RhsPacket B_0, T0;
             LhsPacket A2;
 
-#define EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) \
-            traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0);
-
-#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
-#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
-            do { \
-              if (UseRotatingKernel) { \
-                if (N == 0) { \
-                  B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
-                } else { \
-                  EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
-                  B_0 = protate<1>(B_0); \
-                } \
-              } else { \
-                EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \
-              } \
-            } while (false)
-#else
-#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
-            EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N)
-#endif
-
 #define EIGEN_GEBP_ONESTEP(K) \
             do { \
               EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
@@ -859,19 +1002,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
               traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
               traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
-              EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
               traits.madd(A0, B_0, C0, T0); \
               traits.madd(A1, B_0, C4, T0); \
               traits.madd(A2, B_0, C8, B_0); \
-              EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
               traits.madd(A0, B_0, C1, T0); \
               traits.madd(A1, B_0, C5, T0); \
               traits.madd(A2, B_0, C9, B_0); \
-              EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
               traits.madd(A0, B_0, C2,  T0); \
               traits.madd(A1, B_0, C6,  T0); \
               traits.madd(A2, B_0, C10, B_0); \
-              EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
               traits.madd(A0, B_0, C3 , T0); \
               traits.madd(A1, B_0, C7,  T0); \
               traits.madd(A2, B_0, C11, B_0); \
@@ -904,34 +1047,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           }
 
 #undef EIGEN_GEBP_ONESTEP
-#undef EIGEN_GEBP_ONESTEP_LOADRHS
-#undef EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING
-
-#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
-          if (UseRotatingKernel) {
-            #define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
-              do { \
-                PacketBlock<ResPacket> resblock; \
-                resblock.packet[0] = res0; \
-                resblock.packet[1] = res1; \
-                resblock.packet[2] = res2; \
-                resblock.packet[3] = res3; \
-                ptranspose(resblock); \
-                resblock.packet[3] = protate<1>(resblock.packet[3]); \
-                resblock.packet[2] = protate<2>(resblock.packet[2]); \
-                resblock.packet[1] = protate<3>(resblock.packet[1]); \
-                ptranspose(resblock); \
-                res0 = resblock.packet[0]; \
-                res1 = resblock.packet[1]; \
-                res2 = resblock.packet[2]; \
-                res3 = resblock.packet[3]; \
-              } while (false)
-            
-            EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
-            EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
-            EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
-          }
-#endif
+
+          possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
+          possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
+          possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
 
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index c38c12c31..c76f48154 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -164,6 +164,8 @@ static void run(Index rows, Index cols, Index depth,
 
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+    
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
 
     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
     for(Index i2=0; i2<rows; i2+=mc)
@@ -188,7 +190,8 @@ static void run(Index rows, Index cols, Index depth,
           // We pack the rhs's block into a sequential chunk of memory (L2 caching)
           // Note that this block will be read a very high number of times, which is equal to the number of
           // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
-          pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+          if((!pack_rhs_once) || i2==0)
+            pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
           
           // Everything is packed, we can now call the panel * block kernel:
           gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 5e16b775b..7538a0633 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h
@@ -93,8 +93,7 @@
         THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
         OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
         IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
-        STORAGE_LAYOUT_DOES_NOT_MATCH,
-        ROTATION_BY_ILLEGAL_OFFSET
+        STORAGE_LAYOUT_DOES_NOT_MATCH
       };
     };