path: root/Eigen/src/Core
author    Gael Guennebaud <g.gael@free.fr>    2009-11-20 15:39:38 +0100
committer Gael Guennebaud <g.gael@free.fr>    2009-11-20 15:39:38 +0100
commit eb8f4500719b52f410c545f738d8fda399cea587 (patch)
tree   33ac90a04b6e6b3de516f2851895d92687b4db6a /Eigen/src/Core
parent e3d890bc5a89798eff50ff6650292b4fa934f72e (diff)
Hey, finally the copyCoeff stuff is no longer used only to implement swap :)
Add an internal pseudo expression allowing us to optimize operators like +=, *= using the copyCoeff stuff. This allows us to easily enforce aligned loads for the destination matrix everywhere.
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r--  Eigen/src/Core/CwiseBinaryOp.h                 8
-rw-r--r--  Eigen/src/Core/CwiseUnaryOps.h                12
-rw-r--r--  Eigen/src/Core/GenericPacketMath.h            22
-rw-r--r--  Eigen/src/Core/MapBase.h                      34
-rw-r--r--  Eigen/src/Core/SelfCwiseBinaryOp.h           113
-rw-r--r--  Eigen/src/Core/arch/SSE/PacketMath.h          26
-rw-r--r--  Eigen/src/Core/util/ForwardDeclarations.h      1
7 files changed, 168 insertions, 48 deletions
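At the user level nothing changes syntactically; only the evaluation strategy does. A small sketch of the operators that are now routed through the new pseudo expression (ordinary Eigen usage; nothing hypothetical here except the matrix sizes):

#include <Eigen/Core>

int main()
{
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(8, 8);
  Eigen::MatrixXf B = Eigen::MatrixXf::Random(8, 8);

  A += B;     // now evaluated in place via SelfCwiseBinaryOp<ei_scalar_sum_op<float>, ...>
  A -= B;     // same, with ei_scalar_difference_op
  A *= 2.0f;  // same, with ei_scalar_product_op
  A /= 2.0f;  // floating-point types multiply by 1/other (see the CwiseUnaryOps.h hunk below)
  return 0;
}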
diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h
index 462e0f92d..401d57ee5 100644
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@@ -178,7 +178,9 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
{
- return *this = *this - other;
+ SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived> tmp(derived());
+ tmp = other;
+ return derived();
}
/** replaces \c *this by \c *this + \a other.
@@ -190,7 +192,9 @@ template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
{
- return *this = *this + other;
+ SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived> tmp(derived());
+ tmp = other;
+ return derived();
}
#endif // EIGEN_CWISE_BINARY_OP_H
diff --git a/Eigen/src/Core/CwiseUnaryOps.h b/Eigen/src/Core/CwiseUnaryOps.h
index 39fd479b5..a7acd0036 100644
--- a/Eigen/src/Core/CwiseUnaryOps.h
+++ b/Eigen/src/Core/CwiseUnaryOps.h
@@ -33,9 +33,17 @@ EIGEN_STRONG_INLINE const CwiseUnaryOp<ei_scalar_opposite_op<typename ei_traits<
operator-() const { return derived(); }
EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other)
-{ return *this = *this * other; }
+{
+ SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived> tmp(derived());
+ tmp = PlainMatrixType::Constant(rows(),cols(),other);
+ return derived();
+}
EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other)
-{ return *this = *this / other; }
+{
+ SelfCwiseBinaryOp<typename ei_meta_if<NumTraits<Scalar>::HasFloatingPoint,ei_scalar_product_op<Scalar>,ei_scalar_quotient_op<Scalar> >::ret, Derived> tmp(derived());
+ tmp = PlainMatrixType::Constant(rows(),cols(), NumTraits<Scalar>::HasFloatingPoint ? Scalar(1)/other : other);
+ return derived();
+}
/** \returns an expression of \c *this scaled by the scalar factor \a scalar */
EIGEN_STRONG_INLINE const ScalarMultipleReturnType
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 77e5641ff..ae1720eca 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -34,6 +34,22 @@
* of generic vectorized code.
*/
+#ifndef EIGEN_DEBUG_ALIGNED_LOAD
+#define EIGEN_DEBUG_ALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_LOAD
+#define EIGEN_DEBUG_UNALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_ALIGNED_STORE
+#define EIGEN_DEBUG_ALIGNED_STORE
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_STORE
+#define EIGEN_DEBUG_UNALIGNED_STORE
+#endif
+
struct ei_default_packet_traits
{
enum {
@@ -44,13 +60,13 @@ struct ei_default_packet_traits
HasAbs = 1,
HasMin = 1,
HasMax = 1,
-
+
HasDiv = 0,
HasSqrt = 0,
HasExp = 0,
HasLog = 0,
HasPow = 0,
-
+
HasSin = 0,
HasCos = 0,
HasTan = 0,
@@ -128,7 +144,7 @@ ei_pxor(const Packet& a, const Packet& b) { return a ^ b; }
/** \internal \returns the bitwise andnot of \a a and \a b */
template<typename Packet> inline Packet
ei_pandnot(const Packet& a, const Packet& b) { return a & (!b); }
-
+
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
template<typename Scalar> inline typename ei_packet_traits<Scalar>::type
ei_pload(const Scalar* from) { return *from; }
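The four EIGEN_DEBUG_* hooks above expand to nothing by default, but since they are guarded by #ifndef they can be redefined before including Eigen, for instance to count aligned versus unaligned packet accesses. A minimal sketch, assuming a GCC build where the hooks are expanded in statement position; the counter names below are ours, not Eigen's:

static int aligned_loads = 0, unaligned_loads = 0;
static int aligned_stores = 0, unaligned_stores = 0;

// Must be defined before Eigen is included so the #ifndef guards above pick them up.
#define EIGEN_DEBUG_ALIGNED_LOAD    ++aligned_loads;
#define EIGEN_DEBUG_UNALIGNED_LOAD  ++unaligned_loads;
#define EIGEN_DEBUG_ALIGNED_STORE   ++aligned_stores;
#define EIGEN_DEBUG_UNALIGNED_STORE ++unaligned_stores;

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::VectorXf a = Eigen::VectorXf::Random(64);
  Eigen::VectorXf b = Eigen::VectorXf::Random(64);
  a += b;  // with SSE enabled, the stores to 'a' should all be counted as aligned
  std::cout << aligned_loads  << " aligned / " << unaligned_loads  << " unaligned loads, "
            << aligned_stores << " aligned / " << unaligned_stores << " unaligned stores\n";
  return 0;
}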
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index 8770732de..0f0986bc5 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -34,13 +34,13 @@
* and type \c AlignedDerivedType in their respective ei_traits<> specialization structure.
* The value of \c PacketAccess can be either \b AsRequested, or set to \b EnforceAlignedAccess which
* enforces both aligned loads and stores.
- *
- * \c EnforceAlignedAccess is automatically set in expressions such as
+ *
+ * \c EnforceAlignedAccess is automatically set in expressions such as
* \code A += B; \endcode where A is either a Block or a Map. Here,
* this expression is transformed into \code A = A_with_EnforceAlignedAccess + B; \endcode
* avoiding unaligned loads from A. Indeed, since Eigen's packet evaluation mechanism
* automatically align to the destination matrix, we know that loads to A will be aligned too.
- *
+ *
* The type \c AlignedDerivedType should correspond to the equivalent expression type
* with \c PacketAccess set to \c EnforceAlignedAccess.
*
@@ -197,32 +197,6 @@ template<typename Derived> class MapBase
using Base::operator=;
using Base::operator*=;
- // FIXME it seems VS does not allow to do "using Base::operator+="
- // and to overload operator+= at the same time, therefore we have to
- // explicitly add these two overloads.
- // Maybe there exists a better solution though.
- template<typename ProductDerived, typename Lhs,typename Rhs>
- Derived& operator+=(const Flagged<ProductBase<ProductDerived,Lhs,Rhs>, 0, EvalBeforeAssigningBit>& other)
- { return Base::operator+=(other); }
-
- template<typename ProductDerived, typename Lhs,typename Rhs>
- Derived& operator-=(const Flagged<ProductBase<ProductDerived,Lhs,Rhs>, 0, EvalBeforeAssigningBit>& other)
- { return Base::operator-=(other); }
-
- template<typename OtherDerived>
- Derived& operator+=(const MatrixBase<OtherDerived>& other)
- { return derived() = forceAligned() + other; }
-
- template<typename OtherDerived>
- Derived& operator-=(const MatrixBase<OtherDerived>& other)
- { return derived() = forceAligned() - other; }
-
- Derived& operator*=(const Scalar& other)
- { return derived() = forceAligned() * other; }
-
- Derived& operator/=(const Scalar& other)
- { return derived() = forceAligned() / other; }
-
protected:
void checkDataAlignment() const
@@ -230,7 +204,7 @@ template<typename Derived> class MapBase
ei_assert( ((!(ei_traits<Derived>::Flags&AlignedBit))
|| ((std::size_t(m_data)&0xf)==0)) && "data is not aligned");
}
-
+
const Scalar* EIGEN_RESTRICT m_data;
const ei_int_if_dynamic<RowsAtCompileTime> m_rows;
const ei_int_if_dynamic<ColsAtCompileTime> m_cols;
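The overloads removed above are no longer needed: a Map or Block destination now goes through the generic operators, which wrap it in a SelfCwiseBinaryOp, and the assignment mechanism aligns packet access to the destination. A sketch of that case; the GCC alignment attribute is only there to give the mapped buffer 16-byte alignment for the illustration:

#include <Eigen/Core>

int main()
{
  float data[16] __attribute__((aligned(16))) = {0};  // 16-byte aligned storage (GCC syntax)
  Eigen::Map<Eigen::VectorXf> A(data, 16);
  Eigen::VectorXf B = Eigen::VectorXf::Ones(16);
  A += B;  // evaluated in place; packet loads/stores on A are aligned, only B's loads may not be
  return 0;
}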
diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
new file mode 100644
index 000000000..ac0ccd963
--- /dev/null
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -0,0 +1,113 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SELFCWISEBINARYOP_H
+#define EIGEN_SELFCWISEBINARYOP_H
+
+/** \class SelfCwiseBinaryOp
+ *
+ * \internal
+ *
+ * \brief Internal helper class for optimizing operators like +=, -=
+ */
+template<typename BinaryOp, typename MatrixType>
+struct ei_traits<SelfCwiseBinaryOp<BinaryOp,MatrixType> > : ei_traits<MatrixType> {};
+
+template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
+ : public MatrixBase<SelfCwiseBinaryOp<BinaryOp,MatrixType> >
+{
+ public:
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
+ typedef typename ei_packet_traits<Scalar>::type Packet;
+
+ using Base::operator=;
+
+ inline SelfCwiseBinaryOp(MatrixType& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
+
+ inline int rows() const { return m_matrix.rows(); }
+ inline int cols() const { return m_matrix.cols(); }
+ inline int stride() const { return m_matrix.stride(); }
+
+ // note that this function is needed by assign to correctly align loads/stores
+ // TODO make Assign use .data()
+ inline Scalar& coeffRef(int row, int col)
+ {
+ return m_matrix.const_cast_derived().coeffRef(row, col);
+ }
+
+ // note that this function is needed by assign to correctly align loads/stores
+ // TODO make Assign use .data()
+ inline Scalar& coeffRef(int index)
+ {
+ return m_matrix.const_cast_derived().coeffRef(index);
+ }
+
+ template<typename OtherDerived>
+ void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other)
+ {
+ OtherDerived& _other = other.const_cast_derived();
+ ei_internal_assert(row >= 0 && row < rows()
+ && col >= 0 && col < cols());
+ Scalar& tmp = m_matrix.coeffRef(row,col);
+ tmp = m_functor(tmp, _other.coeff(row,col));
+ }
+
+ template<typename OtherDerived>
+ void copyCoeff(int index, const MatrixBase<OtherDerived>& other)
+ {
+ OtherDerived& _other = other.const_cast_derived();
+ ei_internal_assert(index >= 0 && index < m_matrix.size());
+ Scalar& tmp = m_matrix.coeffRef(index);
+ tmp = m_functor(tmp, _other.coeff(index));
+ }
+
+ template<typename OtherDerived, int StoreMode, int LoadMode>
+ void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other)
+ {
+ OtherDerived& _other = other.const_cast_derived();
+ ei_internal_assert(row >= 0 && row < rows()
+ && col >= 0 && col < cols());
+ m_matrix.template writePacket<StoreMode>(row, col,
+ m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
+ }
+
+ template<typename OtherDerived, int StoreMode, int LoadMode>
+ void copyPacket(int index, const MatrixBase<OtherDerived>& other)
+ {
+ OtherDerived& _other = other.const_cast_derived();
+ ei_internal_assert(index >= 0 && index < m_matrix.size());
+ m_matrix.template writePacket<StoreMode>(index,
+ m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
+ }
+
+ protected:
+ MatrixType& m_matrix;
+ const BinaryOp& m_functor;
+
+ private:
+ SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
+};
+
+#endif // EIGEN_SELFCWISEBINARYOP_H
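Semantically, assigning to a SelfCwiseBinaryOp performs an in-place read-modify-write of the wrapped matrix: copyCoeff computes dst(i,j) = func(dst(i,j), src(i,j)), and copyPacket does the same one packet at a time with the store mode chosen for the destination. A standalone scalar-level sketch of that pattern (hypothetical names, not Eigen code):

#include <cstddef>
#include <functional>
#include <vector>

// dst[i] = op(dst[i], src[i]) for all i -- what copyCoeff does per coefficient;
// copyPacket applies the same operation to 4 floats / 2 doubles at a time.
template <typename Scalar, typename BinaryOp>
void self_cwise_apply(std::vector<Scalar>& dst, const std::vector<Scalar>& src, BinaryOp op)
{
  for (std::size_t i = 0; i < dst.size(); ++i)
    dst[i] = op(dst[i], src[i]);
}

int main()
{
  std::vector<float> a(4, 1.0f), b(4, 2.0f);
  self_cwise_apply(a, b, std::plus<float>());   // a += b, coefficient-wise
  self_cwise_apply(a, b, std::minus<float>());  // a -= b, coefficient-wise
  return 0;
}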
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 60ccadc21..29c89c310 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -172,14 +172,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a,
template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
#if (!defined __GNUC__) && (!defined __ICC)
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { return _mm_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { return EIGEN_DEBUG_UNALIGNED_LOAD _mm_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { return EIGEN_DEBUG_UNALIGNED_LOAD _mm_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return EIGEN_DEBUG_UNALIGNED_LOAD _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
#else
// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
// require pointer casting to incompatible pointer types and leads to invalid code
@@ -188,6 +188,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return
// TODO: do the same for MSVC (ICC is compatible)
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
{
+ EIGEN_DEBUG_UNALIGNED_LOAD
__m128 res;
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
@@ -195,6 +196,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
}
template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
{
+ EIGEN_DEBUG_UNALIGNED_LOAD
__m128d res;
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) );
asm volatile ("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) );
@@ -202,6 +204,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
}
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
{
+ EIGEN_DEBUG_UNALIGNED_LOAD
__m128i res;
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
@@ -209,16 +212,17 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
}
#endif
-template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const Packet2d& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE
_mm_storel_pd((to), from);
_mm_storeh_pd((to+1), from);
}
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { ei_pstoreu((double*)to, _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castps_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
// The temporary variable fixes an internal compilation error.
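For reference, the distinction these hooks annotate: _mm_load_ps/_mm_store_ps require 16-byte aligned pointers (movaps), while the unaligned variants do not but were noticeably slower on CPUs of that era. A bare-bones sketch, outside of Eigen, of the "align to the destination" policy this commit enforces:

#include <xmmintrin.h>

// dst must be 16-byte aligned; src may be arbitrary. This mirrors the policy
// described above: packet access is aligned to the destination, and only the
// source falls back to an unaligned load when necessary.
void add4_inplace(float* dst, const float* src)
{
  __m128 a = _mm_load_ps(dst);          // aligned load from the destination
  __m128 b = _mm_loadu_ps(src);         // source load, possibly unaligned
  _mm_store_ps(dst, _mm_add_ps(a, b));  // aligned store back to the destination
}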
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index 541b5dd9f..fb0233591 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -48,6 +48,7 @@ template<typename NullaryOp, typename MatrixType> class CwiseNullaryOp;
template<typename UnaryOp, typename MatrixType> class CwiseUnaryOp;
template<typename ViewOp, typename MatrixType> class CwiseUnaryView;
template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp;
+template<typename BinOp, typename MatrixType> class SelfCwiseBinaryOp;
template<typename Derived, typename Lhs, typename Rhs> class ProductBase;
template<typename Derived> class DiagonalBase;