1 files changed, 6 insertions, 18 deletions
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index c2f0c07a8..31677e6cb 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -365,8 +365,7 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
   }
 };
 
-// FIXME the following is a hack to get very high perf with matrix-vector product,
-// however, it would be preferable to switch for more general dynamic alignment queries
+// NOTE the following specializations exist because taking .col(0) on a vector is a bit slower
 template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
 struct ei_product_coeff_vectorized_dyn_selector
 {
@@ -481,14 +480,9 @@ struct ei_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, PacketScalar, LoadMod
 ***************************************************************************/
 
 template<typename Scalar, typename RhsType>
-static void ei_cache_friendly_product(
+static void ei_cache_friendly_product_colmajor_times_vector(
   int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res);
 
-enum {
-  HasDirectAccess,
-  NoDirectAccess
-};
-
 template<typename ProductType,
   int LhsRows  = ei_traits<ProductType>::RowsAtCompileTime,
   int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor,
@@ -507,19 +501,13 @@ struct ei_cache_friendly_product_selector
 
 // optimized colmajor * vector path
 template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
-struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,NoDirectAccess,1,RhsOrder,RhsAccess>
 {
-  typedef typename ei_traits<ProductType>::_LhsNested Lhs;
   template<typename DestDerived>
   inline static void run(DestDerived& res, const ProductType& product)
   {
-    ei_scalar_sum_op<typename ProductType::Scalar> _sum;
     const int size = product.rhs().rows();
     for (int k=0; k<size; ++k)
-      if (Lhs::Flags&DirectAccessBit)
-        // TODO to properly hanlde this workaround let's specialize Block for type having the DirectAccessBit
-        res += product.rhs().coeff(k) * Map<DestDerived>(&product.lhs().const_cast_derived().coeffRef(0,k),product.lhs().rows());
-      else
         res += product.rhs().coeff(k) * product.lhs().col(k);
   }
 };
@@ -527,7 +515,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,Col
 // optimized cache friendly colmajor * vector path for matrix with direct access flag
 // NOTE this path coul also be enabled for expressions if we add runtime align queries
 template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
-struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,HasDirectAccess,1,RhsOrder,RhsAccess>
 {
   typedef typename ProductType::Scalar Scalar;
 
@@ -545,7 +533,7 @@ struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,Co
       _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
       Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
     }
-    ei_cache_friendly_product(res.size(),
+    ei_cache_friendly_product_colmajor_times_vector(res.size(),
       &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(),
       product.rhs(), _res);
 
@@ -588,7 +576,7 @@ struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCo
       _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
       Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
     }
-    ei_cache_friendly_product(res.size(),
+    ei_cache_friendly_product_colmajor_times_vector(res.size(),
       &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(),
       product.lhs().transpose(), _res);